package org.opencms.util;

import java.io.Serializable;

import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.Translate;
import org.htmlparser.visitors.NodeVisitor;

/**
 * Extracts the HTML page content.<p>
 *
 * This is a {@link NodeVisitor} that accumulates the text of the visited nodes in an
 * internal buffer. Text inside SCRIPT and STYLE elements is skipped, text inside PRE
 * elements is copied verbatim, and all other text is entity-decoded and (optionally)
 * whitespace-collapsed. Headings (H1 to H6) are emitted on their own line prefixed with
 * <code>"* "</code>, and the TITLE is wrapped in <code>"[ "</code> / <code>" ]"</code>.<p>
 *
 * Typical use is to pass an instance to the parser's visitor traversal and then read the
 * result with {@link #getStrings()}.<p>
 */
public class StringBean extends NodeVisitor implements Serializable {

    /**
     * A newline.
     */
    private static final String NEWLINE = System.getProperty("line.separator");

    /**
     * The length of the NEWLINE.
     */
    private static final int NEWLINE_SIZE = NEWLINE.length();

    /** Serial version UID required for safe serialization. */
    private static final long serialVersionUID = 1596190888769126925L;

    /**
     * The buffer text is stored in while traversing the HTML.
     */
    protected StringBuffer m_buffer;

    /**
     * If <code>true</code> sequences of whitespace characters are replaced
     * with a single space character.
     */
    protected boolean m_collapse;

    /**
     * Set <code>true</code> when traversing a PRE tag.
     */
    protected boolean m_isPre;

    /**
     * Set <code>true</code> when traversing a SCRIPT tag.
     */
    protected boolean m_isScript;

    /**
     * Set <code>true</code> when traversing a STYLE tag.
     */
    protected boolean m_isStyle;

    /**
     * If <code>true</code> the link URLs are embedded in the text output.
     */
    protected boolean m_links;

    /**
     * The extracted text, or <code>null</code> if it has not been computed from the
     * buffer yet.
     */
    protected String m_strings;

    /**
     * Create a StringBean object.
     * Default property values are set to 'do the right thing':
     * <p><code>Links</code> is set <code>false</code> so text appears like a
     * browser would display it, albeit without the colour or underline clues
     * normally associated with a link.</p>
     * <p><code>Collapse</code> is set <code>true</code>, so text appears
     * compact like a browser would display it.</p>
     * <p>Non-breaking spaces (U+00A0) are always replaced with regular spaces
     * outside of PRE elements; this is not configurable in this class.</p>
     */
    public StringBean() {

        // both flags enabled: the visitor recurses into child nodes and visits the node itself
        super(true, true);
        m_strings = null;
        m_links = false;
        m_collapse = true;
        m_buffer = new StringBuffer(4096);
        m_isScript = false;
        m_isPre = false;
        m_isStyle = false;
    }

    /**
     * Get the current 'collapse whitespace' state.
     * If set to <code>true</code> this emulates the operation of browsers
     * in interpreting text where "user agents should collapse input
     * white space sequences when producing output inter-word space".
     * See HTML specification section 9.1 White space
     * <a href="http://www.w3.org/TR/html4/struct/text.html#h-9.1">
     * http://www.w3.org/TR/html4/struct/text.html#h-9.1</a>.
     *
     * @return <code>true</code> if sequences of whitespace (space U+0020,
     * tab U+0009, form feed U+000C, zero-width space U+200B,
     * carriage-return and line feed) are to be replaced with a single
     * space.
     */
    public boolean getCollapse() {

        return m_collapse;
    }

    /**
     * Get the current 'include links' state.
     *
     * @return <code>true</code> if the target URL of each link is appended
     * (in angle brackets) to the extracted text, <code>false</code> otherwise.
     */
    public boolean getLinks() {

        return m_links;
    }

    /**
     * Return the text collected so far.
     * This is the primary output of the bean.
     *
     * @return The user visible (what would be seen in a browser) text, or
     * <code>null</code> if no text has been collected.
     */
    public String getStrings() {

        if (null == m_strings) {
            if (0 == m_buffer.length()) {
                // nothing collected: reset the state, the result stays null
                setStrings();
            } else {
                updateStrings(m_buffer.toString());
            }
        }
        return m_strings;
    }

    /**
     * Set the current 'collapse whitespace' state.
     * If the value actually changes, all text collected so far is discarded
     * (see {@link #setStrings()}), so this should be configured before the
     * HTML is traversed.
     *
     * @param collapse If <code>true</code>, sequences of whitespace
     * will be reduced to a single space.
     */
    public void setCollapse(boolean collapse) {

        if (m_collapse != collapse) {
            m_collapse = collapse;
            setStrings();
        }
    }

    /**
     * Set the 'include links' state.
     * If the value actually changes, all text collected so far is discarded
     * (see {@link #setStrings()}), so this should be configured before the
     * HTML is traversed.
     *
     * @param links Use <code>true</code> if link URLs are to be appended to the
     * extracted text, <code>false</code> otherwise.
     */
    public void setLinks(boolean links) {

        if (m_links != links) {
            m_links = links;
            setStrings();
        }
    }

    /**
     * Resets the state of the PRE, SCRIPT and STYLE flags and finishes heading
     * and title output.
     * If link output is enabled and the end tag belongs to a link, the link
     * target is appended between angle brackets first.
     *
     * @param tag The end tag to process.
     */
    @Override
    public void visitEndTag(Tag tag) {

        Node parent = tag.getParent();
        if ((parent instanceof LinkTag) && getLinks()) {
            // appends the link as text between angle brackets to the output.
            m_buffer.append(" <");
            m_buffer.append(((LinkTag)parent).getLink());
            m_buffer.append(">");
        }

        // tag names are compared case-insensitively and independent of the default locale,
        // exactly like in visitTag(Tag)
        String name = tag.getTagName();
        if ("PRE".equalsIgnoreCase(name)) {
            m_isPre = false;
        } else if ("SCRIPT".equalsIgnoreCase(name)) {
            m_isScript = false;
        } else if ("STYLE".equalsIgnoreCase(name)) {
            m_isStyle = false;
        }

        if (isHeadTag(name)) {
            // terminate the heading line, then force one additional newline
            carriageReturn();
            carriageReturn(true);
        }

        if (isTitleTag(name)) {
            m_buffer.append(" ]");
            // terminate the title line, then force one additional newline
            carriageReturn();
            carriageReturn(true);
        }
    }

    /**
     * Appends the text to the output.
     * Text inside SCRIPT or STYLE is ignored, text inside PRE is appended
     * unchanged. All other text is entity-decoded, non-breaking spaces are
     * replaced with regular spaces, and whitespace is collapsed if
     * {@link #getCollapse()} is <code>true</code>.
     *
     * @param string The text node.
     */
    @Override
    public void visitStringNode(Text string) {

        if (m_isScript || m_isStyle) {
            return;
        }
        String text = string.getText();
        if (m_isPre) {
            // preformatted text is taken over verbatim
            m_buffer.append(text);
            return;
        }
        text = Translate.decode(text);
        text = text.replace('\u00a0', ' ');
        if (getCollapse()) {
            collapse(m_buffer, text);
        } else {
            m_buffer.append(text);
        }
    }

    /**
     * Appends a NEWLINE to the output if the tag breaks flow, and
     * possibly sets the state of the PRE, SCRIPT and STYLE flags.
     * Headings start on a new line prefixed with <code>"* "</code>, the title
     * is prefixed with <code>"[ "</code>.
     *
     * @param tag The tag to examine.
     */
    @Override
    public void visitTag(Tag tag) {

        String name = tag.getTagName();
        if ("PRE".equalsIgnoreCase(name)) {
            m_isPre = true;
        } else if ("SCRIPT".equalsIgnoreCase(name)) {
            m_isScript = true;
        } else if ("STYLE".equalsIgnoreCase(name)) {
            m_isStyle = true;
        }

        if (isHeadTag(name)) {
            carriageReturn(true);
            m_buffer.append("* ");
        } else if (isTitleTag(name)) {
            m_buffer.append("[ ");
        } else if (tag.breaksFlow()) {
            carriageReturn();
        }
    }

    /**
     * Appends a newline to the buffer if there isn't one there already.
     * Except if the buffer is empty.
     */
    protected void carriageReturn() {

        carriageReturn(false);
    }

    /**
     * Appends a newline to the buffer, except if the buffer is empty.
     *
     * @param check if <code>true</code>, the newline is appended even if the buffer
     * already ends with one (this is used to create an empty line); if
     * <code>false</code>, the newline is only appended if the buffer holds at least
     * as many characters as a NEWLINE and does not already end with one
     */
    protected void carriageReturn(boolean check) {

        int length = m_buffer.length();
        if (0 == length) {
            // don't append newlines to the beginning of a buffer
            return;
        }
        if (check || ((NEWLINE_SIZE <= length) && !endsWithNewline(m_buffer))) {
            m_buffer.append(NEWLINE);
        }
    }

    /**
     * Add the given text collapsing whitespace.
     * Use a little finite state machine:
     * <pre>
     * state 0: whitespace was last emitted character
     * state 1: in whitespace
     * state 2: in word
     * A whitespace character moves us to state 1 and any other character
     * moves us to state 2, except that state 0 stays in state 0 until
     * a non-whitespace and going from whitespace to word we emit a space
     * before the character:
     *    input:   whitespace   other-character
     * state\next
     *    0               0             2
     *    1               1        space then 2
     *    2               1             2
     * </pre>
     * The initial state is 0 if the buffer is empty or ends with a space or a
     * NEWLINE, otherwise it is 1, so that text appended after existing
     * non-whitespace content is separated from it by a single space.
     *
     * @param buffer The buffer to append to.
     * @param string The string to append.
     */
    protected void collapse(StringBuffer buffer, String string) {

        int chars = string.length();
        if (0 == chars) {
            return;
        }
        int length = buffer.length();
        int state = ((0 == length) || (buffer.charAt(length - 1) == ' ') || endsWithNewline(buffer)) ? 0 : 1;
        for (int i = 0; i < chars; i++) {
            char character = string.charAt(i);
            switch (character) {
                // see HTML specification section 9.1 White space
                // http://www.w3.org/TR/html4/struct/text.html#h-9.1
                case '\u0020':
                case '\u0009':
                case '\u000C':
                case '\u200B':
                case '\r':
                case '\n':
                    if (0 != state) {
                        state = 1;
                    }
                    break;
                default:
                    if (1 == state) {
                        buffer.append(' ');
                    }
                    state = 2;
                    buffer.append(character);
            }
        }
    }

    /**
     * Resets the extraction result.
     * The cached result string is cleared and the buffer is replaced with a
     * new, empty one, so all text collected so far is discarded.
     */
    protected void setStrings() {

        m_strings = null;
        m_buffer = new StringBuffer(4096);
    }

    /**
     * Assigns the cached result string.
     * The field is only written if the new value differs from the current one.
     *
     * @param strings The new value of the <code>Strings</code> property.
     */
    protected void updateStrings(String strings) {

        if ((null == m_strings) || !m_strings.equals(strings)) {
            m_strings = strings;
        }
    }

    /**
     * Checks if the given buffer ends with the platform NEWLINE sequence.
     *
     * @param buffer the buffer to check
     *
     * @return <code>true</code> if the buffer is at least as long as a NEWLINE and ends with one
     */
    private static boolean endsWithNewline(StringBuffer buffer) {

        int length = buffer.length();
        return (NEWLINE_SIZE <= length) && buffer.substring(length - NEWLINE_SIZE, length).equals(NEWLINE);
    }

    /**
     * Checks if the given tag name denotes a heading (H1 to H6).
     * The comparison is case-insensitive and does not depend on the default locale.
     *
     * @param name the tag name to check, may be <code>null</code>
     *
     * @return <code>true</code> if the name is one of H1, H2, H3, H4, H5 or H6
     */
    private boolean isHeadTag(String name) {

        return "H1".equalsIgnoreCase(name)
            || "H2".equalsIgnoreCase(name)
            || "H3".equalsIgnoreCase(name)
            || "H4".equalsIgnoreCase(name)
            || "H5".equalsIgnoreCase(name)
            || "H6".equalsIgnoreCase(name);
    }

    /**
     * Checks if the given tag name denotes the TITLE element.
     * The comparison is case-insensitive and does not depend on the default locale.
     *
     * @param name the tag name to check, may be <code>null</code>
     *
     * @return <code>true</code> if the name is TITLE
     */
    private boolean isTitleTag(String name) {

        return "TITLE".equalsIgnoreCase(name);
    }
}