/* * To change this template, choose Tools | Templates * and open the template in the editor. */ package org.albite.book.model.parser; import java.util.Vector; import org.albite.albite.AlbiteMIDlet; import org.albite.book.view.StylingConstants; import org.albite.io.html.HTMLSubstitues; import org.albite.io.html.XhtmlStreamReader; ////#define DEBUG_PARSER /** * * This is a <i>very</i> simple HTML parser made for the specific purpose of * parsing HTMLs on the fly, i.e. without conversion. It preserves some of the * formatting: the one specified by the following tags: * - [meta encoding] * - [i] or [em] * - [b] or [strong] * - [center], [left], [right] * * Because of memory considerations, and File linking issues, * images will not be preserved. * * @author albus */ public class HTMLTextParser extends TextParser implements HTMLSubstitues, StylingConstants { private static final String TAG_P = "p"; private static final String TAG_BR = "br"; private static final String TAG_DIV = "div"; private static final String TAG_TR = "tr"; private static final String TAG_LI = "li"; private static final String TAG_IMG = "img"; private static final String TAG_SVG_IMAGE = "image"; private static final String TAG_B = "b"; private static final String TAG_STRONG = "strong"; private static final String TAG_I = "i"; private static final String TAG_EM = "em"; private static final String TAG_H1 = "h1"; private static final String TAG_H2 = "h2"; private static final String TAG_H3 = "h3"; private static final String TAG_H4 = "h4"; private static final String TAG_H5 = "h5"; private static final String TAG_H6 = "h6"; private static final String TAG_CENTER = "center"; private static final String TAG_HR = "hr"; private static final String TAG_PRE = "pre"; private int ignoreTag = 0; private int pre = 0; private int bold = 0; private int italic = 0; private int heading = 0; private int center = 0; private boolean hr = false; private Vector instructions = new Vector(20); public HTMLTextParser() { processBreaks = false; } public final void reset() { ignoreTag = 0; pre = 0; bold = 0; italic = 0; heading = 0; center = 0; hr = false; if (instructions != null) { instructions.removeAllElements(); } super.reset(); } public final boolean parseNext( final char[] text, final int textSize) { //#ifdef DEBUG_PARSER //# AlbiteMIDlet.LOGGER.log("---------------\nParsing: " + text.length + " / " + textSize); //#endif if (!instructions.isEmpty()) { /* * Execute instructions before continuing; */ //#ifdef DEBUG_PARSER //# AlbiteMIDlet.LOGGER.log("Executing pareser instructions"); //#endif state = ((Integer) instructions.lastElement()).byteValue(); instructions.removeElementAt(instructions.size() - 1); return true; } if (!proceed(textSize)) { return false; } if (processWhiteSpace(position, text, textSize)) { return true; } //Parse markup instructions if (parseMarkup(text, textSize)) { return true; } //#ifdef DEBUG_PARSER //# AlbiteMIDlet.LOGGER.log("No markup"); //#endif /* * parsing normal text; stopping at stop-chars or end of textbuffer */ state = (ignoreTag > 0 ? STATE_PASS : STATE_TEXT); for (int i = position; i < textSize; i++) { ch = text[i]; if (isWhiteSpace(ch) || isNewLine(ch) || ch == START_TAG_CHAR) { length = i - position; //#ifdef DEBUG_PARSER //# AlbiteMIDlet.LOGGER.log("Stop character."); //#endif return true; } } //#ifdef DEBUG_PARSER //# AlbiteMIDlet.LOGGER.log("end of 'parseNext'"); //#endif length = textSize - position; state = STATE_TEXT; return true; } private boolean parseMarkup(final char[] text, final int textSize) { int pos = position; boolean terminatingTag = false; /* * At least one char for tags */ //#ifdef DEBUG_PARSER //# AlbiteMIDlet.LOGGER.log("Trying markup: " + textSize + ", " + pos); //#endif if (textSize > pos && text[pos] == START_TAG_CHAR) { //#ifdef DEBUG_PARSER //# AlbiteMIDlet.LOGGER.log("parsing markup..."); //#endif state = STATE_PASS; /* * check if it's a comment tag */ if (pos + 3 < textSize) { if ( text[pos + 1] == '!' && text[pos + 2] == '-' && text[pos + 3] == '-') { /* * It's indeed a comment tag */ position = pos + 4; length = 0; while (position < textSize) { if (text[position] == END_TAG_CHAR && text[position - 1] == '-' && text[position - 2] == '-') { /* * End of comment */ position++; break; } position++; } /* * end of comment (no matter closed or not) */ return true; } } /* * Let's start parsing tags content (if such exists) */ pos++; if (pos >= textSize) { /* * It was a single '<' character dangling at the end of the file */ position = pos; return true; } if (text[pos] == '/') { terminatingTag = true; pos++; } /* * back to position */ position = pos; if (text.length <= pos) { return false; } for (int i = pos; i < textSize; i++) { ch = text[i]; if (ch == END_TAG_CHAR) { length = i - position + 1; /* * Parse the name */ int len = length - 1; int max = position + length - 1; for (int k = position; k < max; k++) { ch = text[k]; if (isWhiteSpace(ch) || isNewLine(ch) || ch == '/') { len = k - position; break; } } final String name = new String(text, position, len); //#ifdef DEBUG_PARSER //# AlbiteMIDlet.LOGGER.log("tag: _" + new String(text, position, length) + "_"); //# AlbiteMIDlet.LOGGER.log("tag name: _" + name + "_"); //# //# if (length + position < textSize) { //# AlbiteMIDlet.LOGGER.log("next char to read after this: _" + text[length + position] + "_, " + ((int) text[length + position])); //# } //#endif if (TAG_IMG.equalsIgnoreCase(name)) { /* * Image */ final String attributes = new String( text, position + len, length - 1 - len); final int[] srcPositions = XhtmlStreamReader.readAttribute( attributes, "src"); if (srcPositions == null) { imageURLPosition = 0; imageURLLength = 0; } else { imageURLPosition = position + len + srcPositions[0]; imageURLLength = srcPositions[1]; } final int[] altPositions = XhtmlStreamReader.readAttribute( attributes, "alt"); if (altPositions == null) { imageTextPosition = 0; imageTextLength = 0; } else { imageTextPosition = position + len + altPositions[0]; imageTextLength = altPositions[1]; } state = STATE_IMAGE; return true; } if (TAG_SVG_IMAGE.equalsIgnoreCase(name)) { /* * SVG Image */ final String attributes = new String( text, position + len, length - 1 - len); final int[] srcPositions = XhtmlStreamReader.readAttribute( attributes, "xlink:href"); if (srcPositions == null) { imageURLPosition = 0; imageURLLength = 0; } else { imageURLPosition = position + len + srcPositions[0]; imageURLLength = srcPositions[1]; } imageTextPosition = 0; imageTextLength = 0; state = STATE_IMAGE; return true; } /* * Obviously, the image tag won't affect the `hr` variable */ final boolean hrOld = hr; hr = false; if (TAG_BR.equalsIgnoreCase(name)) { /* * New line */ state = STATE_NEW_LINE; return true; } if (TAG_P.equalsIgnoreCase(name) || TAG_DIV.equalsIgnoreCase(name) || TAG_TR.equalsIgnoreCase(name) || TAG_LI.equalsIgnoreCase(name)) { /* * New line */ //#ifdef DEBUG_PARSER //# AlbiteMIDlet.LOGGER.log("executed: <P>"); //#endif state = STATE_NEW_SOFT_LINE; return true; } if (TAG_HR.equalsIgnoreCase(name)) { /* * Horizontal ruler */ hr = true; if (!hrOld) { instructions.addElement(new Integer(STATE_NEW_SOFT_LINE)); instructions.addElement(new Integer(STATE_RULER)); instructions.addElement(new Integer(STATE_NEW_SOFT_LINE)); } state = STATE_PASS; return true; } if (terminatingTag) { if (TAG_B.equalsIgnoreCase(name) || TAG_STRONG.equalsIgnoreCase(name)) { bold--; if (bold <= 0) { bold = 0; disableBold = true; state = STATE_STYLING; } else { state = STATE_PASS; } return true; } if (TAG_I.equalsIgnoreCase(name) || TAG_EM.equalsIgnoreCase(name)) { italic--; if (italic <= 0) { italic = 0; disableItalic = true; state = STATE_STYLING; } else { state = STATE_PASS; } return true; } if (TAG_H1.equalsIgnoreCase(name) || TAG_H2.equalsIgnoreCase(name) || TAG_H3.equalsIgnoreCase(name) || TAG_H4.equalsIgnoreCase(name) || TAG_H5.equalsIgnoreCase(name) || TAG_H6.equalsIgnoreCase(name)) { heading--; if (heading <= 0) { heading = 0; disableHeading = true; instructions.addElement(new Integer(STATE_STYLING)); } state = STATE_NEW_SOFT_LINE; return true; } if (TAG_CENTER.equalsIgnoreCase(name)) { center--; if (center <= 0) { center = 0; disableCenterAlign = true; instructions.addElement(new Integer(STATE_STYLING)); } state = STATE_NEW_SOFT_LINE; return true; } if (TAG_PRE.equalsIgnoreCase(name)) { pre--; if (pre <= 0) { pre = 0; processBreaks = false; } state = STATE_PASS; return true; } if (isIgnoreTag(name)) { ignoreTag--; if (ignoreTag < 0) { ignoreTag = 0; } return true; } } else { if (TAG_B.equalsIgnoreCase(name) || TAG_STRONG.equalsIgnoreCase(name)) { bold++; enableBold = true; state = STATE_STYLING; return true; } if (TAG_I.equalsIgnoreCase(name) || TAG_EM.equalsIgnoreCase(name)) { italic++; enableItalic = true; state = STATE_STYLING; return true; } if (TAG_H1.equalsIgnoreCase(name) || TAG_H2.equalsIgnoreCase(name) || TAG_H3.equalsIgnoreCase(name) || TAG_H4.equalsIgnoreCase(name) || TAG_H5.equalsIgnoreCase(name) || TAG_H6.equalsIgnoreCase(name)) { heading++; enableHeading = true; instructions.addElement(new Integer(STATE_NEW_SOFT_LINE)); state = STATE_STYLING; return true; } if (TAG_CENTER.equalsIgnoreCase(name)) { center++; enableCenterAlign = true; instructions.addElement(new Integer(STATE_NEW_SOFT_LINE)); state = STATE_STYLING; return true; } if (TAG_PRE.equalsIgnoreCase(name)) { int k = position + length + 1; if (k < textSize) { if (text[k] == '\n') { length += 2; } else if (text[k] == '\r') { length += 2; k++; if (k < textSize && text[k] == '\n') { length++; } } } pre++; processBreaks = true; state = STATE_PASS; return true; } if (isIgnoreTag(name)) { ignoreTag++; return true; } } return true; } } /* * TODO: Do not know if next line is OK. */ position = textSize; length = 1; return true; } return false; } private static boolean isIgnoreTag(final String s) { return "head".equalsIgnoreCase(s) || "style".equalsIgnoreCase(s) || "form".equalsIgnoreCase(s) || "frameset".equalsIgnoreCase(s) || "map".equalsIgnoreCase(s) || "script".equalsIgnoreCase(s) || "object".equalsIgnoreCase(s) || "applet".equalsIgnoreCase(s) || "noscript".equalsIgnoreCase(s) ; } }