FilterVisitor.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2007-2008 Didier Briel, Martin Fleurke
               2010 Didier Briel
               2011 Didier Briel, Martin Fleurke
               2012 Didier Briel, Martin Fleurke
               2013 Didier Briel, Alex Buloichik
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.filters2.html2;

import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Attribute;
import org.htmlparser.Node;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.visitors.NodeVisitor;
import org.omegat.core.Core;
import org.omegat.util.OStrings;
import org.omegat.util.PatternConsts;
import org.omegat.util.StringUtil;

/**
 * The part of HTML filter that actually does the job. This class is called back
 * by HTMLParser (http://sf.net/projects/htmlparser/).
 *
 * @author Maxym Mykhalchuk
 * @author Didier Briel
 * @author Henry Pijffers (henry.pijffers@saxnot.com)
 * @author Martin Fleurke
 */
public class FilterVisitor extends NodeVisitor {
    protected HTMLFilter2 filter;
    private BufferedWriter writer;
    private HTMLOptions options;

    /**
     * Creates a visitor that writes the processed document to the given writer.
     *
     * @param htmlfilter
     *            the filter that receives the translatable entries
     * @param bufwriter
     *            the writer the resulting document is written to
     * @param opts
     *            the filter options; may be <code>null</code>, in which case
     *            options built from an empty map are used
     */
    public FilterVisitor(HTMLFilter2 htmlfilter, BufferedWriter bufwriter, HTMLOptions opts) {
        this.filter = htmlfilter;
        this.writer = bufwriter;
        // The HHC filter has no options. Falling back to options built from an
        // empty map prevents a null pointer exception later, see
        // https://sourceforge.net/p/omegat/bugs/651/
        this.options = (opts == null) ? new HTMLOptions(new TreeMap<String, String>()) : opts;
    }

    // ///////////////////////////////////////////////////////////////////////
    // Variable declaration
    // ///////////////////////////////////////////////////////////////////////

    /** Should the parser call us for this tag's ending tag and its inner tags. */
    boolean recurse = true;

    /** Are we collecting translatable text right now. */
    boolean text = false;
    // Commented-out field that was meant to hold the translatable text being collected:
    // StringBuffer paragraph;
    /** Did the PRE block start (it means we mustn't compress the spaces). */
    boolean preformatting = false;

    /**
     * The list of non-paragraph tags before a chunk of text.
     * <ul>
     * <li>If a chunk of text follows, they get prepended to the translatable
     * paragraph (starting from the first tag that has its pair inside the chunk
     * of text).
     * <li>Otherwise they are written out directly.
     * </ul>
     */
    List<Node> befors;

    /** The list of nodes forming a chunk of text. */
    List<Node> translatable;

    /**
     * The list of non-paragraph tags following a chunk of text.
     * <ul>
     * <li>If another chunk of text follows, they get appended to the
     * translatable paragraph.
     * <li>Otherwise (a paragraph tag follows), they are written out directly.
     * </ul>
     */
    List<Node> afters;

    /** The tags behind the shortcuts. */
    List<Tag> sTags;
    /** The tag numbers of the shortcutized tags. */
    List<Integer> sTagNumbers;
    /** The list of all the tag shortcuts. */
    List<String> sShortcuts;
    /** The number of shortcuts stored. */
    int sNumShortcuts;

    /**
     * Self traversal predicate. Reports the current value of the
     * <code>recurse</code> flag.
     *
     * @return <code>true</code> if a node itself is to be visited.
     */
    @Override
    public boolean shouldRecurseSelf() {
        return recurse;
    }

    /**
     * Depth traversal predicate. Reports the current value of the
     * <code>recurse</code> flag.
     *
     * @return <code>true</code> if children are to be visited.
     */
    @Override
    public boolean shouldRecurseChildren() {
        return recurse;
    }

    /**
     * Called for each start <code>Tag</code> visited.
     * <p>
     * A tag that has to be kept intact is written out unchanged. For any other
     * tag the translatable attributes are sent for translation one by one, and
     * the tag is then queued.
     *
     * @param tag
     *            The tag being visited.
     */
    @Override
    public void visitTag(Tag tag) {
        boolean keepIntact = isIntactTag(tag);
        if (!keepIntact) {
            // Not intact by its name: the key-value pairs stored in the
            // configuration may still declare it intact
            for (Object item : tag.getAttributesEx()) {
                Attribute attribute = (Attribute) item;
                String attrName = attribute.getName();
                String attrValue = attribute.getValue();
                if (attrName != null && attrValue != null
                        && this.filter.checkIgnoreTags(attrName, attrValue)) {
                    keepIntact = true;
                    break;
                }
            }
        }

        if (keepIntact) {
            // finish whatever is pending, then copy the tag as it is
            if (text) {
                endup();
            } else {
                flushbefors();
            }
            writeout(tag.toHtml());
            if (tag.getEndTag() != null) {
                recurse = false;
            }
            return;
        }

        if (isParagraphTag(tag) && text) {
            endup();
        }
        if (isPreformattingTag(tag) || Core.getFilterMaster().getConfig().isPreserveSpaces()) {
            preformatting = true;
        }

        final String tagName = tag.getTagName();

        // Translate attributes of tags if they are not null. The order of the
        // calls is the order in which the attribute segments are produced.
        maybeTranslateAttribute(tag, "abbr");
        maybeTranslateAttribute(tag, "alt");
        if (options.getTranslateHref()) {
            maybeTranslateAttribute(tag, "href");
        }
        if (options.getTranslateHreflang()) {
            maybeTranslateAttribute(tag, "hreflang");
        }
        if (options.getTranslateLang()) {
            maybeTranslateAttribute(tag, "lang");
            maybeTranslateAttribute(tag, "xml:lang");
        }
        maybeTranslateAttribute(tag, "label");
        if ("IMG".equals(tagName) && options.getTranslateSrc()) {
            maybeTranslateAttribute(tag, "src");
        }
        maybeTranslateAttribute(tag, "summary");
        maybeTranslateAttribute(tag, "title");

        if ("INPUT".equals(tagName)) {
            // The value is translated either for all input elements, or only
            // for submit/button/reset elements, depending on the options
            boolean translateValue = options.getTranslateValue();
            if (!translateValue && options.getTranslateButtonValue()) {
                String type = tag.getAttribute("type");
                translateValue = "submit".equalsIgnoreCase(type)
                        || "button".equalsIgnoreCase(type)
                        || "reset".equalsIgnoreCase(type);
            }
            if (translateValue) {
                maybeTranslateAttribute(tag, "value");
            }
            maybeTranslateAttribute(tag, "placeholder");
        }

        // Special handling of meta-tag: depending on the other attributes
        // the content-attribute should or should not be translated.
        // The group of attribute-value pairs indicating non-translation
        // are stored in the configuration
        if ("META".equals(tagName)) {
            boolean skipContent = false;
            for (Object item : tag.getAttributesEx()) {
                Attribute attribute = (Attribute) item;
                String attrName = attribute.getName();
                String attrValue = attribute.getValue();
                if (attrName != null && attrValue != null
                        && this.filter.checkDoSkipMetaTag(attrName, attrValue)) {
                    skipContent = true;
                    break;
                }
            }
            if (!skipContent) {
                maybeTranslateAttribute(tag, "content");
            }
        }

        queuePrefix(tag);
    }

    /**
     * Translates an attribute of the tag as a separate segment, provided the
     * attribute is present (its value is not <code>null</code>). The translated
     * value replaces the original one in the tag.
     *
     * @param tag
     *            the tag object
     * @param key
     *            the name of the attribute
     */
    protected void maybeTranslateAttribute(Tag tag, String key) {
        String source = tag.getAttribute(key);
        if (source == null) {
            return;
        }
        // comment passed along with the entry: names the tag and the attribute
        StringBuilder comment = new StringBuilder();
        comment.append(OStrings.getString("HTMLFILTER_TAG")).append(" ").append(tag.getTagName());
        comment.append(" ").append(OStrings.getString("HTMLFILTER_ATTRIBUTE")).append(" ").append(key);

        String translated = filter.privateProcessEntry(entitiesToChars(source), comment.toString());
        tag.setAttribute(key, charsToEntities(translated));
    }

    /**
     * Stays <code>true</code> until {@link #visitStringNode(Text)} meets the
     * first non-blank text that is not written out as an XML header.
     */
    boolean firstcall = true;

    /**
     * Called for each chunk of text (<code>StringNode</code>) visited.
     * <p>
     * The first non-blank chunk switches the visitor into text collecting mode;
     * chunks met before that are queued as a prefix.
     *
     * @param string
     *            The string node being visited.
     */
    @Override
    public void visitStringNode(Text string) {
        recurse = true;

        // nbsp is special case - process it like usual spaces
        String meaningful = entitiesToChars(string.getText()).replace((char) 0xA0, ' ').trim();
        boolean blank = meaningful.isEmpty();

        // Hack around HTMLParser not being able to handle XHTML
        // RFE pending:
        // http://sourceforge.net/tracker/index.php?func=detail&aid=1227222&group_id=24399&atid=381402
        if (!blank && firstcall && PatternConsts.XML_HEADER.matcher(meaningful).matches()) {
            writeout(string.toHtml());
            return;
        }

        if (!blank) {
            text = true;
            firstcall = false;
        }

        if (!text) {
            queuePrefix(string);
        } else {
            queueTranslatable(string);
        }
    }

    /**
     * Called for each comment (<code>RemarkNode</code>) visited.
     * <p>
     * A comment ends the text being collected. The comment itself is written
     * out unless the options ask for comments to be removed.
     *
     * @param remark
     *            The remark node being visited.
     */
    @Override
    public void visitRemarkNode(Remark remark) {
        recurse = true;
        if (text) {
            endup();
        }
        if (options.getRemoveComments()) {
            return;
        }
        writeout(remark.toHtml());
    }

    /**
     * Called for each end <code>Tag</code> visited.
     * <p>
     * The end of a paragraph tag finishes the text being collected, and the end
     * of a preformatting tag switches preformatting off. The tag is then queued.
     *
     * @param tag
     *            The end tag being visited.
     */
    @Override
    public void visitEndTag(Tag tag) {
        recurse = true;

        boolean closesParagraph = isParagraphTag(tag) && text;
        if (closesParagraph) {
            endup();
        }

        boolean closesPreformatting = isPreformattingTag(tag);
        if (closesPreformatting) {
            preformatting = false;
        }

        queuePrefix(tag);
    }

    /**
     * This method is called before the parsing. It resets the collecting state
     * by calling {@link #cleanup()}.
     */
    @Override
    public void beginParsing() {
        cleanup();
    }

    /**
     * Called upon parsing completion. Sends out the text still being collected,
     * or, when no text is being collected, writes out the queued prefix nodes.
     */
    @Override
    public void finishedParsing() {
        if (!text) {
            flushbefors();
            return;
        }
        endup();
    }

    /**
     * Does the tag lead to starting (ending) a paragraph.
     * <p>
     * Contains code donated by JC to have dictionary list parsed as segmenting.
     *
     * @see <a href="https://sourceforge.net/p/omegat/feature-requests/102/">RFE
     *      #102</a>
     */
    private boolean isParagraphTag(Tag tag) {
        switch (tag.getTagName()) {
        // Bugfix for https://sourceforge.net/p/omegat/bugs/84/
        // ADDRESS tag is also a paragraph tag
        case "ADDRESS":
        case "BLOCKQUOTE":
        case "BODY":
        case "CENTER":
        case "DIV":
        case "H1":
        case "H2":
        case "H3":
        case "H4":
        case "H5":
        case "H6":
        case "HTML":
        case "HEAD":
        case "TITLE":
        case "TABLE":
        case "TR":
        case "TD":
        case "TH":
        case "P":
        case "PRE":
        case "OL":
        case "UL":
        case "LI":
        // Added by JC to have dictionary list parsed as segmenting.
        case "DL":
        case "DT":
        case "DD":
        // End of JC's contribution
        case "FORM":
        case "TEXTAREA":
        case "FIELDSET":
        case "LEGEND":
        case "LABEL":
        case "SELECT":
        case "OPTION":
        case "HR":
            return true;
        // Optional paragraph on BR
        case "BR":
            return options.getParagraphOnBr();
        default:
            return false;
        }
    }

    /**
     * Should the contents of this tag be kept intact? This is decided by the
     * tag name; a META tag is intact only when its <code>http-equiv</code>
     * attribute is <code>content-type</code> (compared ignoring case).
     */
    private boolean isIntactTag(Tag tag) {
        switch (tag.getTagName()) {
        case "!DOCTYPE":
        case "STYLE":
        case "SCRIPT":
        case "OBJECT":
        case "EMBED":
            return true;
        case "META":
            return "content-type".equalsIgnoreCase(tag.getAttribute("http-equiv"));
        default:
            return false;
        }
    }

    /** Is the tag space-preserving, i.e. is it a PRE or a TEXTAREA tag? */
    private boolean isPreformattingTag(Tag tag) {
        final String name = tag.getTagName();
        if (name.equals("PRE")) {
            return true;
        }
        return name.equals("TEXTAREA");
    }

    /**
     * Writes something to writer.
     * <p>
     * An {@link IOException} raised by the writer is not propagated: it is only
     * printed to standard output, and the caller is not told about the failure.
     *
     * @param something
     *            the text to write
     */
    private void writeout(String something) {
        try {
            writer.write(something);
        } catch (IOException ioe) {
            // NOTE(review): the error is reported on stdout only; consider a
            // proper log if callers need to learn about failed writes.
            System.out.println(ioe);
        }
    }

    /**
     * Ends the segment collection and sends the translatable text out to OmegaT
     * core, and some extra tags to writer.
     * <p>
     * The nodes of <code>befors</code>, <code>translatable</code> and
     * <code>afters</code> are put into one list. The part between the "first
     * good" and the "last good" node forms the paragraph that is sent for
     * translation (tags being replaced with shortcuts); everything before and
     * after that part is written out directly. Finally the collecting state is
     * reset with {@link #cleanup()}.
     * <p>
     * Unless preformatting is on, the whitespace around the paragraph is written
     * out separately as a prefix and a postfix. When the compress whitespace
     * option is enabled, each of them is reduced to its first code point.
     */
    protected void endup() {
        // detecting the first starting tag in 'befors'
        // that has its ending in the paragraph
        // all before this "first good" are simply written out
        List<Node> all = new ArrayList<Node>();
        all.addAll(befors);
        all.addAll(translatable);
        int firstgoodlimit = befors.size();
        int firstgood = 0;
        while (firstgood < firstgoodlimit) {
            Node goodNode = all.get(firstgood);
            if (!(goodNode instanceof Tag)) {
                firstgood++;
                continue;
            }
            Tag good = (Tag) goodNode;

            // looking forward for the ending tag that closes this one;
            // 'recursion' counts the nesting of same-named tags
            int recursion = 1;
            boolean found = false;
            for (int i = firstgood + 1; i < all.size(); i++) {
                Node candNode = all.get(i);
                if (candNode instanceof Tag) {
                    Tag cand = (Tag) candNode;
                    if (cand.getTagName().equals(good.getTagName())) {
                        if (!cand.isEndTag()) {
                            recursion++;
                        } else {
                            recursion--;
                            if (recursion == 0) {
                                // the ending only counts if it lies in the
                                // translatable part, not in 'befors'
                                if (i >= firstgoodlimit) {
                                    found = true;
                                }
                                // we've found an ending tag for this "good one"
                                break;
                            }
                        }
                    }
                }
            }
            // if we could find an ending,
            // this is a "good one"
            if (found) {
                break;
            }
            firstgood++;
        }

        // detecting the last ending tag in 'afters'
        // that has its starting in the paragraph
        // all after this "last good" is simply written out
        int lastgoodlimit = all.size() - 1;
        all.addAll(afters);
        int lastgood = all.size() - 1;
        while (lastgood > lastgoodlimit) {
            Node goodNode = all.get(lastgood);
            if (!(goodNode instanceof Tag)) {
                lastgood--;
                continue;
            }
            Tag good = (Tag) goodNode;

            // looking backward for the starting tag that this one closes;
            // 'recursion' counts the nesting of same-named tags
            int recursion = 1;
            boolean found = false;
            for (int i = lastgood - 1; i >= firstgoodlimit; i--) {
                Node candNode = all.get(i);
                if (candNode instanceof Tag) {
                    Tag cand = (Tag) candNode;
                    if (cand.getTagName().equals(good.getTagName())) {
                        if (cand.isEndTag()) {
                            recursion++;
                        } else {
                            recursion--;
                            if (recursion == 0) {
                                // the starting only counts if it lies in the
                                // translatable part, not in 'afters'
                                if (i <= lastgoodlimit) {
                                    found = true;
                                }
                                // we've found a starting tag for this
                                // "good one"
                                break;
                            }
                        }
                    }
                }
            }
            // if we could find a starting,
            // this is a "good one"
            if (found) {
                break;
            }
            lastgood--;
        }

        // Widening the paragraph: unless the configuration asks for their
        // removal, tags and text nodes lying outside of the "good" range are
        // pulled into it. Repeated until nothing changes any more.
        boolean changed = true;
        while (changed) {
            changed = false;
            boolean removeTags = Core.getFilterMaster().getConfig().isRemoveTags();
            if (!removeTags) {
                for (int i = 0; i < firstgood; i++) {
                    Node node = all.get(i);
                    if (node instanceof Tag) {
                        firstgood = i;
                        changed = true;
                        break;
                    }
                }
                for (int i = all.size() - 1; i > lastgood; i--) {
                    Node node = all.get(i);
                    if (node instanceof Tag) {
                        lastgood = i;
                        changed = true;
                        break;
                    }
                }
            }

            boolean removeSpacesAround = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
            if (!removeSpacesAround) {
                for (int i = 0; i < firstgood; i++) {
                    Node node = all.get(i);
                    if (node instanceof TextNode) {
                        firstgood = i;
                        changed = true;
                        break;
                    }
                }
                for (int i = all.size() - 1; i > lastgood; i--) {
                    Node node = all.get(i);
                    if (node instanceof TextNode) {
                        lastgood = i;
                        changed = true;
                        break;
                    }
                }
            }
        }

        // writing out all tags before the "first good" one
        for (int i = 0; i < firstgood; i++) {
            Node node = all.get(i);
            if (node instanceof Tag) {
                writeout("<" + node.getText() + ">");
            } else {
                writeout(compressWhitespace(node.getText()));
            }
        }

        // appending everything from the "first good" up to the "last good" one
        // to the paragraph text, tags being replaced with their shortcuts
        StringBuilder paragraph = new StringBuilder();
        for (int i = firstgood; i <= lastgood; i++) {
            Node node = all.get(i);
            if (node instanceof Tag) {
                shortcut((Tag) node, paragraph);
            } else { // node instanceof Text
                paragraph.append(entitiesToChars(node.toHtml()));
            }
        }

        String uncompressed = paragraph.toString();
        String compressed = uncompressed;
        String spacePrefix = "";
        String spacePostfix = "";
        int size = uncompressed.length();
        // We're compressing the space if this paragraph wasn't inside <PRE> tag
        // But if the translator does not translate the paragraph,
        // then we write out the uncompressed version,
        // as documented in
        // https://sourceforge.net/p/omegat/bugs/108/
        // The spaces that are around the segment are not removed, unless
        // compressWhitespace option is enabled. Then the spaces are compressed to max 1.
        // (This changes the layout, therefore it is an option)
        if (!preformatting) {

            for (int cp, i = 0; i < size; i += Character.charCount(cp)) {
                cp = uncompressed.codePointAt(i);
                if (!Character.isWhitespace(cp)) {
                    // Here i is the length of the leading whitespace run. When
                    // compressing, only its first code point is kept, so the
                    // end offset is measured from the start of the string.
                    spacePrefix = i == 0 ? "" : uncompressed.substring(0,
                            options.getCompressWhitespace() ? Math.min(i, uncompressed.offsetByCodePoints(0, 1)) : i);
                    break;
                }
            }
            for (int cp, i = size; i > 0; i -= Character.charCount(cp)) {
                cp = uncompressed.codePointBefore(i);
                if (!Character.isWhitespace(cp)) {
                    // Here i is the start of the trailing whitespace run. When
                    // compressing, only its first code point is kept.
                    spacePostfix = i == size ? ""
                            : uncompressed.substring(i, options.getCompressWhitespace()
                                    ? Math.min(uncompressed.offsetByCodePoints(i, 1), size) : size);
                    break;
                }
            }

            // otherwise 'compressed' keeps its initial value, the uncompressed text
            if (Core.getFilterMaster().getConfig().isRemoveSpacesNonseg()) {
                compressed = StringUtil.compressSpaces(uncompressed);
            }
        }

        // getting the translation
        String translation = filter.privateProcessEntry(compressed, null);

        // writing out uncompressed
        if (compressed.equals(translation) && !options.getCompressWhitespace()) {
            translation = uncompressed;
        }

        // converting the characters &, < and > into the entities
        // &amp;, &lt; and &gt; respectively
        // note that this doesn't change < and > of tag shortcuts
        translation = charsToEntities(translation);
        // expands tag shortcuts into full-blown tags
        translation = unshorcutize(translation);
        // writing out the paragraph into target file
        writeout(spacePrefix);
        writeout(translation);
        writeout(spacePostfix);

        // writing out all tags after the "last good" one
        for (int i = lastgood + 1; i < all.size(); i++) {
            Node node = all.get(i);
            if (node instanceof Tag) {
                writeout("<" + node.getText() + ">");
            } else {
                writeout(compressWhitespace(node.getText()));
            }
        }

        cleanup();
    }

    /**
     * Inits a new paragraph: empties the three node queues and the shortcut
     * bookkeeping, and resets the <code>text</code> and <code>recurse</code>
     * flags to their initial values.
     */
    private void cleanup() {
        // node queues
        befors = new ArrayList<>();
        translatable = new ArrayList<>();
        afters = new ArrayList<>();

        // shortcut bookkeeping
        sNumShortcuts = 0;
        sShortcuts = new ArrayList<>();
        sTagNumbers = new ArrayList<>();
        sTags = new ArrayList<>();

        // flags
        recurse = true;
        text = false;
    }

    /**
     * Creates and stores a shortcut for the tag, and appends the shortcut to
     * the paragraph.
     * <p>
     * A shortcut consists of the lowercased first character of the tag name
     * (<code>br</code> for a BR tag) and a number. An ending tag gets the number
     * of the starting tag it closes, when such a tag was shortcutized before;
     * every other tag gets the next free number.
     *
     * @param tag
     *            the tag to make a shortcut for
     * @param paragraph
     *            the paragraph text the shortcut is appended to
     */
    private void shortcut(Tag tag, StringBuilder paragraph) {
        final String tagName = tag.getTagName();
        final boolean closing = tag.isEndTag();

        int number = -1;
        if (closing) {
            // trying to look up the appropriate starting tag among the tags
            // shortcutized so far; 'depth' counts nested same-named tags
            int depth = 1;
            for (int idx = sTags.size() - 1; idx >= 0 && number < 0; idx--) {
                Tag earlier = sTags.get(idx);
                if (!earlier.getTagName().equals(tagName)) {
                    continue;
                }
                if (earlier.isEndTag()) {
                    depth++;
                } else if (--depth == 0) {
                    // we've found a starting tag for this ending one
                    number = sTagNumbers.get(idx);
                }
            }
        }
        if (number < 0) {
            // a starting tag, or an ending tag without a starting one
            number = sNumShortcuts++;
        }

        StringBuilder code = new StringBuilder("<");
        if (closing) {
            code.append('/');
        }
        if ("BR".equals(tagName)) {
            // special handling for BR tag, as it's given a two-char shortcut
            // to allow for its segmentation in sentence-segmentation mode
            // idea by Jean-Christophe Helary
            code.append("br");
        } else {
            code.appendCodePoint(Character.toLowerCase(tagName.codePointAt(0)));
        }
        code.append(number);
        // This only detects tags that already have a slash in the source, but
        // ignores HTML 4.x style <br>, <img>, and similar tags without one.
        // Appending the slash for BR and IMG as well would fix that, but would
        // break backwards compatibility with previously translated HTML files.
        if (tag.isEmptyXmlTag()) {
            code.append('/');
        }
        code.append('>');

        String text = code.toString();
        sTags.add(tag);
        sTagNumbers.add(number);
        sShortcuts.add(text);
        paragraph.append(text);
    }

    /**
     * Recovers tag shortcuts into full tags.
     * <p>
     * Every occurrence of every stored shortcut is replaced with the text of
     * the tag behind it, enclosed in angle brackets. After a replacement the
     * search goes on behind the inserted tag, so the text of the tag itself is
     * never scanned for the shortcut it has just replaced.
     *
     * @param str
     *            the text containing tag shortcuts
     * @return the text with the shortcuts expanded
     */
    private String unshorcutize(String str) {
        for (int i = 0; i < sShortcuts.size(); i++) {
            String shortcut = sShortcuts.get(i);
            int pos = str.indexOf(shortcut);
            while (pos >= 0) {
                String fullTag = "<" + sTags.get(i).getText() + ">";
                str = str.substring(0, pos) + fullTag + str.substring(pos + shortcut.length());
                // Resume behind the inserted tag: resuming inside it would
                // expand the tag again and again if its own text (e.g. an
                // attribute value) happened to contain the shortcut.
                pos = str.indexOf(shortcut, pos + fullTag.length());
            }
        }
        return str;
    }

    /**
     * Queues the text to the translatable paragraph.
     * <p>
     * Text that is not purely whitespace first moves the previously queued
     * tags and whitespace from <code>afters</code> into the translatable
     * paragraph, and is then added to the paragraph itself.
     * <p>
     * Whitespace-only text is simply added to the <code>afters</code> queue.
     */
    private void queueTranslatable(Text txt) {
        boolean whitespaceOnly = txt.toHtml().trim().isEmpty();
        if (whitespaceOnly) {
            afters.add(txt);
            return;
        }
        translatable.addAll(afters);
        afters.clear();
        translatable.add(txt);
    }

    /**
     * Queues the tag to the translatable paragraph.
     * <p>
     * Note that the tag is simply added to the <code>afters</code> queue, and
     * will be appended to the translatable text only if some meaningful text
     * follows it.
     */
    private void queueTranslatable(Tag tag) {
        afters.add(tag);
    }

    /**
     * Queues up a tag, possibly before a text.
     * <ul>
     * <li>If the text is collected now, the tag is queued up as translatable by
     * calling {@link #queueTranslatable(Tag)}.
     * <li>Otherwise a paragraph tag is written out directly, after the nodes
     * queued before it.
     * <li>Any other tag is collected to a special list that is inspected when
     * the translatable text is sent to OmegaT core.
     * </ul>
     */
    protected void queuePrefix(Tag tag) {
        if (text) {
            queueTranslatable(tag);
            return;
        }
        if (!isParagraphTag(tag)) {
            befors.add(tag);
            return;
        }
        flushbefors();
        writeout("<" + tag.getText() + ">");
    }

    /**
     * Queues up some text met before a meaningful text. The text is always
     * added to the <code>befors</code> list, which is inspected when the
     * translatable text is sent to OmegaT core, or written out when the list
     * is flushed.
     */
    private void queuePrefix(Text txt) {
        befors.add(txt);
    }

    /**
     * Saves "befors" to output stream and cleans the list. Tags are written
     * enclosed in angle brackets; any other node has its text passed through
     * {@link #compressWhitespace(String)} first.
     */
    private void flushbefors() {
        for (Node pending : befors) {
            String output = (pending instanceof Tag)
                    ? "<" + pending.getText() + ">"
                    : compressWhitespace(pending.getText());
            writeout(output);
        }
        befors.clear();
    }

    /**
     * Remove consecutive whitespace if
     * {@code options.getCompressWhitespace()==true}: every match of
     * {@code PatternConsts.SPACE_TAB} is replaced with a single space. Only
     * space+tab is removed; newlines are not touched, to preserve the layout a
     * little more. With the option off, the input is returned unchanged.
     * <p>
     * NB: We cannot use {@code StringUtil.compressSpaces}, because it trims a
     * string consisting of only whitespace to the empty string.
     *
     * @param input
     *            some text outside / between tags where it is allowed to
     *            compress spaces.
     * @return the compressed input.
     */
    private String compressWhitespace(String input) {
        if (!options.getCompressWhitespace()) {
            return input;
        }
        // keep at least 1 space, as not to change the meaning of the document.
        return PatternConsts.SPACE_TAB.matcher(input).replaceAll(" ");
    }

    /**
     * Named HTML entities and their corresponding numeric character
     * references.
     * <p>
     * Every row is a pair {@code { name, codePoint }}: the entity name as a
     * {@code String} without the leading {@code '&'} and trailing {@code ';'},
     * and the Unicode code point as an {@code Integer}. The values follow the
     * HTML 4 character entity tables.
     */
    private static final Object[][] ENTITIES = {
            // C0 Controls and Basic Latin
            { "quot", 34 },
            { "amp", 38 },
            { "lt", 60 },
            { "gt", 62 },

            // Latin Extended-A
            { "OElig", 338 }, // latin capital ligature OE, U+0152 ISOlat2
            { "oelig", 339 }, // latin small ligature oe, U+0153 ISOlat2
                              // ligature is a misnomer, this is a separate
                              // character in some languages
            { "Scaron", 352 }, // latin capital letter S with caron, U+0160 ISOlat2
            { "scaron", 353 }, // latin small letter s with caron, U+0161 ISOlat2
            { "Yuml", 376 }, // latin capital letter Y with diaeresis, U+0178 ISOlat2

            // Spacing Modifier Letters
            { "circ", 710 }, // modifier letter circumflex accent, U+02C6 ISOpub
            { "tilde", 732 }, // small tilde, U+02DC ISOdia

            // General Punctuation
            { "ensp", 8194 }, // en space, U+2002 ISOpub
            { "emsp", 8195 }, // em space, U+2003 ISOpub
            { "thinsp", 8201 }, // thin space, U+2009 ISOpub
            { "zwnj", 8204 }, // zero width non-joiner, U+200C NEW RFC 2070
            { "zwj", 8205 }, // zero width joiner, U+200D NEW RFC 2070
            { "lrm", 8206 }, // left-to-right mark, U+200E NEW RFC 2070
            { "rlm", 8207 }, // right-to-left mark, U+200F NEW RFC 2070
            { "ndash", 8211 }, // en dash, U+2013 ISOpub
            { "mdash", 8212 }, // em dash, U+2014 ISOpub
            { "lsquo", 8216 }, // left single quotation mark, U+2018 ISOnum
            { "rsquo", 8217 }, // right single quotation mark, U+2019 ISOnum
            { "sbquo", 8218 }, // single low-9 quotation mark, U+201A NEW
            { "ldquo", 8220 }, // left double quotation mark, U+201C ISOnum
            { "rdquo", 8221 }, // right double quotation mark, U+201D ISOnum
            { "bdquo", 8222 }, // double low-9 quotation mark, U+201E NEW
            { "dagger", 8224 }, // dagger, U+2020 ISOpub
            { "Dagger", 8225 }, // double dagger, U+2021 ISOpub
            { "permil", 8240 }, // per mille sign, U+2030 ISOtech
            { "lsaquo", 8249 }, // single left-pointing angle quotation mark, U+2039 ISO
                                // proposed: lsaquo is proposed but not yet ISO standardized
            { "rsaquo", 8250 }, // single right-pointing angle quotation mark, U+203A ISO
                                // proposed: rsaquo is proposed but not yet ISO standardized
            { "euro", 8364 }, // euro sign, U+20AC NEW

            // ISO 8859-1 (Latin-1) characters, U+00A0 .. U+00FF
            { "nbsp", 160 }, { "iexcl", 161 }, { "cent", 162 },
            { "pound", 163 }, { "curren", 164 }, { "yen", 165 },
            { "brvbar", 166 }, { "sect", 167 }, { "uml", 168 },
            { "copy", 169 }, { "ordf", 170 }, { "laquo", 171 },
            { "not", 172 }, { "shy", 173 }, { "reg", 174 },
            { "macr", 175 }, { "deg", 176 }, { "plusmn", 177 },
            { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 },
            { "micro", 181 }, { "para", 182 }, { "middot", 183 },
            { "cedil", 184 }, { "sup1", 185 }, { "ordm", 186 },
            { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
            { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 },
            { "Aacute", 193 }, { "Acirc", 194 }, { "Atilde", 195 },
            { "Auml", 196 }, { "Aring", 197 }, { "AElig", 198 },
            { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
            { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 },
            { "Iacute", 205 }, { "Icirc", 206 }, { "Iuml", 207 },
            { "ETH", 208 }, { "Ntilde", 209 }, { "Ograve", 210 },
            { "Oacute", 211 }, { "Ocirc", 212 }, { "Otilde", 213 },
            { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
            { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 },
            { "Uuml", 220 }, { "Yacute", 221 }, { "THORN", 222 },
            { "szlig", 223 }, { "agrave", 224 }, { "aacute", 225 },
            { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
            { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 },
            { "egrave", 232 }, { "eacute", 233 }, { "ecirc", 234 },
            { "euml", 235 }, { "igrave", 236 }, { "iacute", 237 },
            { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
            { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 },
            { "ocirc", 244 }, { "otilde", 245 }, { "ouml", 246 },
            { "divide", 247 }, { "oslash", 248 }, { "ugrave", 249 },
            { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
            { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },

            // Latin Extended-B
            { "fnof", 402 },

            // Greek
            { "Alpha", 913 }, { "Beta", 914 }, { "Gamma", 915 },
            { "Delta", 916 }, { "Epsilon", 917 }, { "Zeta", 918 },
            { "Eta", 919 }, { "Theta", 920 }, { "Iota", 921 },
            { "Kappa", 922 }, { "Lambda", 923 }, { "Mu", 924 },
            { "Nu", 925 }, { "Xi", 926 }, { "Omicron", 927 },
            { "Pi", 928 }, { "Rho", 929 }, { "Sigma", 931 },
            { "Tau", 932 }, { "Upsilon", 933 }, { "Phi", 934 },
            { "Chi", 935 }, { "Psi", 936 }, { "Omega", 937 },
            { "alpha", 945 }, { "beta", 946 }, { "gamma", 947 },
            { "delta", 948 }, { "epsilon", 949 }, { "zeta", 950 },
            { "eta", 951 }, { "theta", 952 }, { "iota", 953 },
            { "kappa", 954 }, { "lambda", 955 }, { "mu", 956 },
            { "nu", 957 }, { "xi", 958 }, { "omicron", 959 },
            { "pi", 960 }, { "rho", 961 }, { "sigmaf", 962 },
            { "sigma", 963 }, { "tau", 964 }, { "upsilon", 965 },
            { "phi", 966 }, { "chi", 967 }, { "psi", 968 },
            { "omega", 969 }, { "thetasym", 977 }, { "upsih", 978 },
            { "piv", 982 },

            // General Punctuation (symbols)
            { "bull", 8226 }, { "hellip", 8230 }, { "prime", 8242 },
            { "Prime", 8243 }, { "oline", 8254 }, { "frasl", 8260 },

            // Letterlike Symbols
            { "weierp", 8472 }, { "image", 8465 }, { "real", 8476 },
            { "trade", 8482 }, { "alefsym", 8501 },

            // Arrows
            { "larr", 8592 }, { "uarr", 8593 }, { "rarr", 8594 },
            { "darr", 8595 }, { "harr", 8596 }, { "crarr", 8629 },
            { "lArr", 8656 }, { "uArr", 8657 }, { "rArr", 8658 },
            { "dArr", 8659 }, { "hArr", 8660 },

            // Mathematical Operators
            { "forall", 8704 }, { "part", 8706 }, { "exist", 8707 },
            { "empty", 8709 }, { "nabla", 8711 }, { "isin", 8712 },
            { "notin", 8713 }, { "ni", 8715 }, { "prod", 8719 },
            { "sum", 8721 }, // n-ary summation, U+2211 (distinct from minus, U+2212)
            { "minus", 8722 }, { "lowast", 8727 },
            { "radic", 8730 }, { "prop", 8733 }, { "infin", 8734 },
            { "ang", 8736 },
            { "and", 8743 }, // logical and, U+2227 (distinct from perp, U+22A5)
            { "or", 8744 }, // logical or, U+2228
            { "cap", 8745 }, { "cup", 8746 }, { "int", 8747 },
            { "there4", 8756 }, { "sim", 8764 }, { "cong", 8773 },
            { "asymp", 8776 }, // almost equal to, U+2248 (distinct from cong, U+2245)
            { "ne", 8800 }, { "equiv", 8801 },
            { "le", 8804 }, { "ge", 8805 }, { "sub", 8834 },
            { "sup", 8835 }, { "nsub", 8836 }, { "sube", 8838 },
            { "supe", 8839 }, { "oplus", 8853 }, { "otimes", 8855 },
            { "perp", 8869 }, { "sdot", 8901 },

            // Miscellaneous Technical
            { "lceil", 8968 }, { "rceil", 8969 }, { "lfloor", 8970 },
            { "rfloor", 8971 }, { "lang", 9001 }, { "rang", 9002 },

            // Geometric Shapes
            { "loz", 9674 },

            // Miscellaneous Symbols
            { "spades", 9824 }, { "clubs", 9827 }, { "hearts", 9829 },
            { "diams", 9830 } };

    /**
     * Converts HTML entities to normal characters.
     * <p>
     * Three forms of character reference are recognised after an
     * {@code '&'}:
     * <ul>
     * <li>hexadecimal numeric: {@code &#x20AC;} or {@code &#X20AC;}</li>
     * <li>decimal numeric: {@code &#8364;}</li>
     * <li>named: {@code &euro;}, resolved through {@code lookupEntity}</li>
     * </ul>
     * The terminating {@code ';'} is optional; when present it is consumed
     * together with the reference. Numeric references must denote a code
     * point in {@code 1..0x10FFFF}, named ones a value in {@code 1..65535}.
     * Anything that does not resolve (unknown name, no digits, number out of
     * range, or an {@code '&'} / {@code "&#"} at the very end of the input)
     * leaves the {@code '&'} in the output as a literal character and
     * processing continues with the character right after it.
     *
     * @param str
     *            text possibly containing HTML character references.
     * @return the text with all resolvable references replaced by the
     *         characters they stand for.
     */
    protected String entitiesToChars(String str) {
        int strlen = str.length();
        StringBuilder res = new StringBuilder(strlen);
        int i = 0;
        while (i < strlen) {
            int cp = str.codePointAt(i);
            if (cp != '&') {
                res.appendCodePoint(cp);
                i += Character.charCount(cp);
                continue;
            }

            // Code point the reference resolves to, or -1 for a dangling '&'.
            int decoded = -1;
            // Index just past the entity name / digits (before an optional ';').
            int end = i + 1;
            if (end < strlen) {
                char first = str.charAt(end);
                if (first == '#') {
                    // numeric entity; bounds are checked so that an input
                    // ending in "&#" is treated as a dangling '&'
                    int radix = 10;
                    int digitsStart = end + 1;
                    if (digitsStart < strlen
                            && (str.charAt(digitsStart) == 'x' || str.charAt(digitsStart) == 'X')) {
                        radix = 16;
                        digitsStart++;
                    }
                    end = digitsStart;
                    while (end < strlen) {
                        char digit = str.charAt(end);
                        boolean accepted = (radix == 16) ? isHexDigit(digit) : isDecimalDigit(digit);
                        if (!accepted) {
                            break;
                        }
                        end++;
                    }
                    try {
                        int value = Integer.parseInt(str.substring(digitsStart, end), radix);
                        if (value > 0 && value <= 0x10FFFF) {
                            decoded = value;
                        }
                    } catch (NumberFormatException ignored) {
                        // no digits at all, or too many to fit into an int:
                        // not a usable reference, keep the '&' as it is
                    }
                } else if (isLatinLetter(first)) {
                    // named entity?
                    while (end < strlen) {
                        char nameChar = str.charAt(end);
                        // Some entities contain numbers, e.g. frac12
                        if (!isLatinLetter(nameChar) && !isDecimalDigit(nameChar)) {
                            break;
                        }
                        end++;
                    }
                    int value = lookupEntity(str.substring(i + 1, end));
                    if (value > 0 && value <= 65535) {
                        decoded = value;
                    }
                }
            }

            if (decoded < 0) {
                // dangling '&'
                res.append('&');
                i++;
            } else {
                res.appendCodePoint(decoded);
                i = end;
                // swallow the optional terminating semicolon
                if (i < strlen && str.charAt(i) == ';') {
                    i++;
                }
            }
        }
        return res.toString();
    }

    /**
     * Tells whether the given code point is a basic (ASCII) latin letter,
     * i.e. lies in {@code A..Z} or {@code a..z}.
     */
    private boolean isLatinLetter(int ch) {
        if (ch >= 'A' && ch <= 'Z') {
            return true;
        }
        return ch >= 'a' && ch <= 'z';
    }

    /**
     * Tells whether the given code point is one of the ASCII decimal digits
     * {@code 0..9}.
     */
    private boolean isDecimalDigit(int ch) {
        boolean belowZero = ch < '0';
        boolean aboveNine = ch > '9';
        return !(belowZero || aboveNine);
    }

    /**
     * Tells whether the given code point is an ASCII hexadecimal digit:
     * {@code 0..9}, {@code a..f} or {@code A..F}.
     */
    private boolean isHexDigit(int ch) {
        if (ch >= '0' && ch <= '9') {
            return true;
        }
        if (ch >= 'A' && ch <= 'F') {
            return true;
        }
        return ch >= 'a' && ch <= 'f';
    }

    /**
     * Resolves the name of an HTML entity (given without {@code '&'} and
     * {@code ';'}) by scanning the {@code ENTITIES} table from top to bottom.
     *
     * @param entity
     *            the entity name to look for; the comparison is
     *            case-sensitive.
     * @return the numeric value stored for the first matching name, or -1 if
     *         the passed string is not a known entity.
     */
    private int lookupEntity(String entity) {
        for (Object[] pair : ENTITIES) {
            if (!entity.equals(pair[0])) {
                continue;
            }
            return (Integer) pair[1];
        }
        return -1;
    }

    /**
     * Converts characters that must be converted into HTML entities.
     * <p>
     * First pass, character by character:
     * <ul>
     * <li>non-breaking space (U+00A0) becomes {@code &nbsp;}</li>
     * <li>{@code '&'} becomes {@code &amp;}</li>
     * <li>{@code '>'} becomes {@code &gt;}, unless it directly follows a
     * {@code '?'} (end of a processing instruction), in which case it is kept
     * </li>
     * <li>{@code '<'} becomes {@code &lt;}, unless it is directly followed by
     * a {@code '?'} (start of a processing instruction), or the text from it
     * up to and including the next {@code '>'} equals one of the entries of
     * {@code sShortcuts}; such a shortcut is copied to the output unchanged
     * </li>
     * </ul>
     * Second pass: if {@code filter.getTargetEncoding()} is not
     * {@code null}, every character the target charset cannot encode is
     * rewritten as a decimal numeric character reference
     * ({@code &#NNNN;}), see [1802000] HTML filter loses html-encoded
     * characters if not supported.
     *
     * @param str
     *            the text to escape.
     * @return the escaped text.
     */
    protected String charsToEntities(String str) {
        int strlen = str.length();
        StringBuilder res = new StringBuilder(strlen * 5);
        for (int cp, i = 0; i < strlen; i += Character.charCount(cp)) {
            cp = str.codePointAt(i);
            switch (cp) {
            case '\u00A0':
                res.append("&nbsp;");
                break;
            case '&':
                res.append("&amp;");
                break;
            case '>':
                // If it's the end of a processing instruction
                if ((i > 0) && str.codePointBefore(i) == '?') {
                    res.append(">");
                } else {
                    res.append("&gt;");
                }
                break;
            case '<':
                // If it's the beginning of a processing instruction
                if (str.startsWith("<?", i)) {
                    res.append("<");
                    break;
                }
                int gtpos = str.indexOf('>', i);
                if (gtpos >= 0) {
                    String maybeShortcut = str.substring(i, gtpos + 1);
                    boolean foundShortcut = false; // here because it's
                                                   // impossible to step out of
                                                   // two loops at once
                    for (String currShortcut : sShortcuts) {
                        if (maybeShortcut.equals(currShortcut)) {
                            // skipping the conversion of < into &lt;
                            // because it's a part of the tag
                            foundShortcut = true;
                            break;
                        }
                    }
                    if (foundShortcut) {
                        res.append(maybeShortcut);
                        // the loop increment then steps over the closing '>'
                        i = gtpos;
                        continue;
                    } else {
                        // dangling <
                        res.append("&lt;");
                    }
                } else {
                    // dangling <
                    res.append("&lt;");
                }
                break;
            default:
                res.appendCodePoint(cp);
            }
        }
        String contents = res.toString();

        // Rewrite characters that cannot be encoded in the target encoding
        // as numeric character references.
        // [1802000] HTML filter loses html-encoded characters if not supported
        String encoding = this.filter.getTargetEncoding();
        if (encoding == null) {
            return contents;
        }
        CharsetEncoder charsetEncoder = Charset.forName(encoding).newEncoder();
        StringBuilder encodable = new StringBuilder(contents.length());
        for (int cp, i = 0; i < contents.length(); i += Character.charCount(cp)) {
            cp = contents.codePointAt(i);
            // One whole code point (one or two chars) is tested at a time.
            String symbol = contents.substring(i, i + Character.charCount(cp));
            if (charsetEncoder.canEncode(symbol)) {
                encodable.append(symbol);
            } else {
                encodable.append("&#").append(cp).append(';');
            }
        }
        return encodable.toString();
    }

}