/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk 2007-2008 Didier Briel, Martin Fleurke 2010 Didier Briel 2011 Didier Briel, Martin Fleurke 2012 Didier Briel, Martin Fleurke 2013 Didier Briel, Alex Buloichik Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.filters2.html2; import java.io.BufferedWriter; import java.io.IOException; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.TreeMap; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.htmlparser.Attribute; import org.htmlparser.Node; import org.htmlparser.Remark; import org.htmlparser.Tag; import org.htmlparser.Text; import org.htmlparser.nodes.TextNode; import org.htmlparser.visitors.NodeVisitor; import org.omegat.core.Core; import org.omegat.util.OStrings; import org.omegat.util.PatternConsts; import org.omegat.util.StringUtil; /** * The part of HTML filter that actually does the job. This class is called back * by HTMLParser (http://sf.net/projects/htmlparser/). * * @author Maxym Mykhalchuk * @author Didier Briel * @author Henry Pijffers (henry.pijffers@saxnot.com) * @author Martin Fleurke */ public class FilterVisitor extends NodeVisitor { protected HTMLFilter2 filter; private BufferedWriter writer; private HTMLOptions options; public FilterVisitor(HTMLFilter2 htmlfilter, BufferedWriter bufwriter, HTMLOptions opts) { this.filter = htmlfilter; // HHC filter has no options if (opts != null) { this.options = opts; } else { // To prevent a null pointer exception later, see https://sourceforge.net/p/omegat/bugs/651/ this.options = new HTMLOptions(new TreeMap<String, String>()); } this.writer = bufwriter; } // /////////////////////////////////////////////////////////////////////// // Variable declaration // /////////////////////////////////////////////////////////////////////// /** Should the parser call us for this tag's ending tag and its inner tags. */ boolean recurse = true; /** Do we collect the translatable text now. */ boolean text = false; /** The translatable text being collected. */ // StringBuffer paragraph; /** Did the PRE block start (it means we mustn't compress the spaces). */ boolean preformatting = false; /** * The list of non-paragraph tags before a chunk of text. * <ul> * <li>If a chunk of text follows, they get prepended to the translatable * paragraph, (starting from the first tag having a pair inside a chunk of * text) * <li>Otherwise they are written out directly. * </ul> */ List<Node> befors; /** The list of nodes forming a chunk of text. */ List<Node> translatable; /** * The list of non-paragraph tags following a chunk of text. * <ul> * <li>If another chunk of text follows, they get appended to the * translatable paragraph, * <li>Otherwise (paragraph tag follows), they are written out directly. * </ul> */ List<Node> afters; /** The tags behind the shortcuts */ List<Tag> sTags; /** The tag numbers of shorcutized tags */ List<Integer> sTagNumbers; /** The list of all the tag shortcuts */ List<String> sShortcuts; /** The number of shortcuts stored */ int sNumShortcuts; /** * Self traversal predicate. * * @return <code>true</code> if a node itself is to be visited. */ @Override public boolean shouldRecurseSelf() { return recurse; } /** * Depth traversal predicate. * * @return <code>true</code> if children are to be visited. */ @Override public boolean shouldRecurseChildren() { return recurse; } /** * Called for each <code>Tag</code> visited. * * @param tag * The tag being visited. */ @Override public void visitTag(Tag tag) { boolean intactTag = isIntactTag(tag); if (!intactTag) { // If it's an intact tag, no reason to check // Decide whether this tag should be intact, based on the key-value pairs stored in the // configuration Vector<?> tagAttributes = tag.getAttributesEx(); Iterator<?> i = tagAttributes.iterator(); while (i.hasNext() && !intactTag) { Attribute attribute = (Attribute) i.next(); String name = attribute.getName(); String value = attribute.getValue(); if (name == null || value == null) { continue; } intactTag = this.filter.checkIgnoreTags(name, value); } } if (intactTag) { if (text) { endup(); } else { flushbefors(); } writeout(tag.toHtml()); if (tag.getEndTag() != null) { recurse = false; } } else { // recurse = true; if (isParagraphTag(tag) && text) { endup(); } if (isPreformattingTag(tag) || Core.getFilterMaster().getConfig().isPreserveSpaces()) { preformatting = true; } // Translate attributes of tags if they are not null. maybeTranslateAttribute(tag, "abbr"); maybeTranslateAttribute(tag, "alt"); if (options.getTranslateHref()) { maybeTranslateAttribute(tag, "href"); } if (options.getTranslateHreflang()) { maybeTranslateAttribute(tag, "hreflang"); } if (options.getTranslateLang()) { maybeTranslateAttribute(tag, "lang"); maybeTranslateAttribute(tag, "xml:lang"); } maybeTranslateAttribute(tag, "label"); if ("IMG".equals(tag.getTagName()) && options.getTranslateSrc()) { maybeTranslateAttribute(tag, "src"); } maybeTranslateAttribute(tag, "summary"); maybeTranslateAttribute(tag, "title"); if ("INPUT".equals(tag.getTagName())) { //an input element if (options.getTranslateValue() //and we translate all input elements || options.getTranslateButtonValue() // or we translate submit/button/reset elements ... && ("submit".equalsIgnoreCase(tag.getAttribute("type")) || "button".equalsIgnoreCase(tag.getAttribute("type")) || "reset".equalsIgnoreCase(tag.getAttribute("type")) ) //and it is a submit/button/reset element. ) { //then translate the value maybeTranslateAttribute(tag, "value"); } maybeTranslateAttribute(tag, "placeholder"); } // Special handling of meta-tag: depending on the other attributes // the contents-attribute should or should not be translated. // The group of attribute-value pairs indicating non-translation // are stored in the configuration if ("META".equals(tag.getTagName())) { Vector<?> tagAttributes = tag.getAttributesEx(); Iterator<?> i = tagAttributes.iterator(); boolean doSkipMetaTag = false; while (i.hasNext() && !doSkipMetaTag) { Attribute attribute = (Attribute) i.next(); String name = attribute.getName(); String value = attribute.getValue(); if (name == null || value == null) { continue; } doSkipMetaTag = this.filter.checkDoSkipMetaTag(name, value); } if (!doSkipMetaTag) { maybeTranslateAttribute(tag, "content"); } } queuePrefix(tag); } } /** * If the attribute of the tag is not empty, it translates it as a separate * segment. * * @param tag * the tag object * @param key * the name of the attribute */ protected void maybeTranslateAttribute(Tag tag, String key) { String attr = tag.getAttribute(key); if (attr != null) { String comment = OStrings.getString("HTMLFILTER_TAG") + " " + tag.getTagName() + " " + OStrings.getString("HTMLFILTER_ATTRIBUTE") + " " + key; String trans = filter.privateProcessEntry(entitiesToChars(attr), comment); tag.setAttribute(key, charsToEntities(trans)); } } boolean firstcall = true; /** * Called for each chunk of text (<code>StringNode</code>) visited. * * @param string * The string node being visited. */ @Override public void visitStringNode(Text string) { recurse = true; // nbsp is special case - process it like usual spaces String trimmedtext = entitiesToChars(string.getText()).replace((char) 160, ' ').trim(); if (!trimmedtext.isEmpty()) { // Hack around HTMLParser not being able to handle XHTML // RFE pending: // http://sourceforge.net/tracker/index.php?func=detail&aid=1227222&group_id=24399&atid=381402 if (firstcall && PatternConsts.XML_HEADER.matcher(trimmedtext).matches()) { writeout(string.toHtml()); return; } text = true; firstcall = false; } if (text) { queueTranslatable(string); } else { queuePrefix(string); } } /** * Called for each comment (<code>RemarkNode</code>) visited. * * @param remark * The remark node being visited. */ @Override public void visitRemarkNode(Remark remark) { recurse = true; if (text) { endup(); } if (!options.getRemoveComments()) { writeout(remark.toHtml()); } } /** * Called for each end <code>Tag</code> visited. * * @param tag * The end tag being visited. */ @Override public void visitEndTag(Tag tag) { recurse = true; if (isParagraphTag(tag) && text) { endup(); } if (isPreformattingTag(tag)) { preformatting = false; } queuePrefix(tag); } /** * This method is called before the parsing. */ @Override public void beginParsing() { cleanup(); } /** * Called upon parsing completion. */ @Override public void finishedParsing() { if (text) { endup(); } else { flushbefors(); } } /** * Does the tag lead to starting (ending) a paragraph. * <p> * Contains code donated by JC to have dictionary list parsed as segmenting. * * @see <a href="https://sourceforge.net/p/omegat/feature-requests/102/">RFE * #102</a> */ private boolean isParagraphTag(Tag tag) { String tagname = tag.getTagName(); return // Bugfix for https://sourceforge.net/p/omegat/bugs/84/ // ADDRESS tag is also a paragraph tag tagname.equals("ADDRESS") || tagname.equals("BLOCKQUOTE") || tagname.equals("BODY") || tagname.equals("CENTER") || tagname.equals("DIV") || tagname.equals("H1") || tagname.equals("H2") || tagname.equals("H3") || tagname.equals("H4") || tagname.equals("H5") || tagname.equals("H6") || tagname.equals("HTML") || tagname.equals("HEAD") || tagname.equals("TITLE") || tagname.equals("TABLE") || tagname.equals("TR") || tagname.equals("TD") || tagname.equals("TH") || tagname.equals("P") || tagname.equals("PRE") || tagname.equals("OL") || tagname.equals("UL") || tagname.equals("LI") || // Added by JC to have dictionary list parsed as segmenting. tagname.equals("DL") || tagname.equals("DT") || tagname.equals("DD") || // End of JC's contribution tagname.equals("FORM") || tagname.equals("TEXTAREA") || tagname.equals("FIELDSET") || tagname.equals("LEGEND") || tagname.equals("LABEL") || tagname.equals("SELECT") || tagname.equals("OPTION") || tagname.equals("HR") // Optional paragraph on BR || (tagname.equals("BR") && options.getParagraphOnBr()); } /** Should a contents of this tag be kept intact? */ private boolean isIntactTag(Tag tag) { String tagname = tag.getTagName(); return tagname.equals("!DOCTYPE") || tagname.equals("STYLE") || tagname.equals("SCRIPT") || tagname.equals("OBJECT") || tagname.equals("EMBED") || (tagname.equals("META") && "content-type".equalsIgnoreCase(tag.getAttribute("http-equiv"))); } /** Is the tag space-preserving? */ private boolean isPreformattingTag(Tag tag) { String tagname = tag.getTagName(); return tagname.equals("PRE") || tagname.equals("TEXTAREA"); } /** Writes something to writer. */ private void writeout(String something) { try { writer.write(something); } catch (IOException ioe) { System.out.println(ioe); } } /** * Ends the segment collection and sends the translatable text out to OmegaT * core, and some extra tags to writer. */ protected void endup() { // detecting the first starting tag in 'befors' // that has its ending in the paragraph // all before this "first good" are simply written out List<Node> all = new ArrayList<Node>(); all.addAll(befors); all.addAll(translatable); int firstgoodlimit = befors.size(); int firstgood = 0; while (firstgood < firstgoodlimit) { Node goodNode = all.get(firstgood); if (!(goodNode instanceof Tag)) { firstgood++; continue; } Tag good = (Tag) goodNode; // trying to test int recursion = 1; boolean found = false; for (int i = firstgood + 1; i < all.size(); i++) { Node candNode = all.get(i); if (candNode instanceof Tag) { Tag cand = (Tag) candNode; if (cand.getTagName().equals(good.getTagName())) { if (!cand.isEndTag()) { recursion++; } else { recursion--; if (recursion == 0) { if (i >= firstgoodlimit) { found = true; } // we've found an ending tag for this "good one" break; } } } } } // if we could find an ending, // this is a "good one" if (found) { break; } firstgood++; } // detecting the last ending tag in 'afters' // that has its starting in the paragraph // all after this "last good" is simply writen out int lastgoodlimit = all.size() - 1; all.addAll(afters); int lastgood = all.size() - 1; while (lastgood > lastgoodlimit) { Node goodNode = all.get(lastgood); if (!(goodNode instanceof Tag)) { lastgood--; continue; } Tag good = (Tag) goodNode; // trying to test int recursion = 1; boolean found = false; for (int i = lastgood - 1; i >= firstgoodlimit; i--) { Node candNode = all.get(i); if (candNode instanceof Tag) { Tag cand = (Tag) candNode; if (cand.getTagName().equals(good.getTagName())) { if (cand.isEndTag()) { recursion++; } else { recursion--; if (recursion == 0) { if (i <= lastgoodlimit) { found = true; } // we've found a starting tag for this // "good one" break; } } } } } // if we coud find a starting, // this is a "good one" if (found) { break; } lastgood--; } boolean changed = true; while (changed) { changed = false; boolean removeTags = Core.getFilterMaster().getConfig().isRemoveTags(); if (!removeTags) { for (int i = 0; i < firstgood; i++) { Node node = all.get(i); if (node instanceof Tag) { firstgood = i; changed = true; break; } } for (int i = all.size() - 1; i > lastgood; i--) { Node node = all.get(i); if (node instanceof Tag) { lastgood = i; changed = true; break; } } } boolean removeSpacesAround = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg(); if (!removeSpacesAround) { for (int i = 0; i < firstgood; i++) { Node node = all.get(i); if (node instanceof TextNode) { firstgood = i; changed = true; break; } } for (int i = all.size() - 1; i > lastgood; i--) { Node node = all.get(i); if (node instanceof TextNode) { lastgood = i; changed = true; break; } } } } // writing out all tags before the "first good" one for (int i = 0; i < firstgood; i++) { Node node = all.get(i); if (node instanceof Tag) { writeout("<" + node.getText() + ">"); } else { writeout(compressWhitespace(node.getText())); } } // appending all tags until "last good" one to paragraph text StringBuilder paragraph = new StringBuilder(); // appending all tags starting from "first good" one to paragraph text for (int i = firstgood; i <= lastgood; i++) { Node node = all.get(i); if (node instanceof Tag) { shortcut((Tag) node, paragraph); } else { // node instanceof Text paragraph.append(entitiesToChars(node.toHtml())); } } String uncompressed = paragraph.toString(); String compressed = uncompressed; String spacePrefix = ""; String spacePostfix = ""; int size = uncompressed.length(); // We're compressing the space if this paragraph wasn't inside <PRE> tag // But if the translator does not translate the paragraph, // then we write out the uncompressed version, // as documented in // https://sourceforge.net/p/omegat/bugs/108/ // The spaces that are around the segment are not removed, unless // compressWhitespace option is enabled. Then the spaces are compressed to max 1. // (This changes the layout, therefore it is an option) if (!preformatting) { for (int cp, i = 0; i < size; i += Character.charCount(cp)) { cp = uncompressed.codePointAt(i); if (!Character.isWhitespace(cp)) { spacePrefix = i == 0 ? "" : uncompressed.substring(0, options.getCompressWhitespace() ? Math.min(i, uncompressed.offsetByCodePoints(i, 1)) : i); break; } } for (int cp, i = size; i > 0; i -= Character.charCount(cp)) { cp = uncompressed.codePointBefore(i); if (!Character.isWhitespace(cp)) { spacePostfix = i == size ? "" : uncompressed.substring(i, options.getCompressWhitespace() ? Math.min(uncompressed.offsetByCodePoints(i, 1), size) : size); break; } } if (Core.getFilterMaster().getConfig().isRemoveSpacesNonseg()) { compressed = StringUtil.compressSpaces(uncompressed); } else { compressed = uncompressed; } } // getting the translation String translation = filter.privateProcessEntry(compressed, null); // writing out uncompressed if (compressed.equals(translation) && !options.getCompressWhitespace()) { translation = uncompressed; } // converting & < and > into & < and > respectively // note that this doesn't change < and > of tag shortcuts translation = charsToEntities(translation); // expands tag shortcuts into full-blown tags translation = unshorcutize(translation); // writing out the paragraph into target file writeout(spacePrefix); writeout(translation); writeout(spacePostfix); // writing out all tags after the "last good" one for (int i = lastgood + 1; i < all.size(); i++) { Node node = all.get(i); if (node instanceof Tag) { writeout("<" + node.getText() + ">"); } else { writeout(compressWhitespace(node.getText())); } } cleanup(); } /** * Inits a new paragraph. */ private void cleanup() { text = false; recurse = true; // paragraph = new StringBuffer(); befors = new ArrayList<>(); translatable = new ArrayList<>(); afters = new ArrayList<>(); sTags = new ArrayList<>(); sTagNumbers = new ArrayList<>(); sShortcuts = new ArrayList<>(); sNumShortcuts = 0; } /** * Creates and stores a shortcut for the tag. */ private void shortcut(Tag tag, StringBuilder paragraph) { StringBuilder result = new StringBuilder(); result.append('<'); int n = -1; if (tag.isEndTag()) { result.append('/'); // trying to lookup for appropriate starting tag int recursion = 1; for (int i = sTags.size() - 1; i >= 0; i--) { Tag othertag = sTags.get(i); if (othertag.getTagName().equals(tag.getTagName())) { if (othertag.isEndTag()) { recursion++; } else { recursion--; if (recursion == 0) { // we've found a starting tag for this ending one // !!! n = sTagNumbers.get(i); break; } } } } if (n < 0) { // ending tag without a starting one n = sNumShortcuts; sNumShortcuts++; } } else { n = sNumShortcuts; sNumShortcuts++; } // special handling for BR tag, as it's given a two-char shortcut // to allow for its segmentation in sentence-segmentation mode // idea by Jean-Christophe Helary if ("BR".equals(tag.getTagName())) { result.append("br"); } else { result.appendCodePoint(Character.toLowerCase(tag.getTagName().codePointAt(0))); } result.append(n); if (tag.isEmptyXmlTag()) { // This only detects tags that already have a // slash in the source, result.append('/'); // but ignores HTML 4.x style <br>, <img>, and // similar tags without one // The code below would fix that, but breaks // backwards compatibility // with previously translated HTML files } // if (tag.isEmptyXmlTag() || tag.getTagName().equals("BR") || // tag.getTagName().equals("IMG")) // result.append('/'); result.append('>'); String shortcut = result.toString(); sTags.add(tag); sTagNumbers.add(n); sShortcuts.add(shortcut); paragraph.append(shortcut); } /** * Recovers tag shortcuts into full tags. */ private String unshorcutize(String str) { for (int i = 0; i < sShortcuts.size(); i++) { String shortcut = sShortcuts.get(i); int pos = -1; while ((pos = str.indexOf(shortcut, pos + 1)) >= 0) { Tag tag = sTags.get(i); try { str = str.substring(0, pos) + "<" + tag.getText() + ">" + str.substring(pos + shortcut.length()); } catch (StringIndexOutOfBoundsException sioobe) { // nothing, string doesn't change // but prevent endless loop break; } } } return str; } /** * Queues the text to the translatable paragraph. * <p> * Note that the queued text (if not-purely-whitespace) will also append the * previously queued tags and whitespace tags to the translatable paragraph. * <p> * Whitespace text is simply added to the queue. */ private void queueTranslatable(Text txt) { if (!txt.toHtml().trim().isEmpty()) { translatable.addAll(afters); afters.clear(); translatable.add(txt); } else { afters.add(txt); } } /** * Queues the tag to the translatable paragraph. * <p> * Note that the tag is simply added to the queue, and will be appended to * the translatable text only if some meaningful text follows it. */ private void queueTranslatable(Tag tag) { afters.add(tag); } /** * Queues up something, possibly before a text. If the text is collected * now, the tag is queued up as translatable by calling * {@link #queueTranslatable(Tag)}, otherwise it's collected to a special * list that is inspected when the translatable text is sent to OmegaT core. */ protected void queuePrefix(Tag tag) { if (text) { queueTranslatable(tag); } else if (isParagraphTag(tag)) { flushbefors(); writeout("<" + tag.getText() + ">"); } else { befors.add(tag); } } /** * Queues up some text, possibly before a meaningful text. If the text is * collected now, the tag is queued up as translatable by calling * {@link #queueTranslatable(Tag)}, otherwise it's collected to a special * list that is inspected when the translatable text is sent to OmegaT core. */ private void queuePrefix(Text txt) { befors.add(txt); } /** Saves "Befors" to output stream and cleans the list. */ private void flushbefors() { for (Node node : befors) { if (node instanceof Tag) { writeout("<" + node.getText() + ">"); } else { writeout(compressWhitespace(node.getText())); } } befors.clear(); } /** * Remove consecutive whitespace if * {@code options.getCompressWhitespace()==true}, and only space+tab is * removed. Newlines are not touched, to preserve the layout a little more. * <p> * NB: We cannot use {@code StaticUtils.compressSpaces}, because it trims a * string consisting of only whitespace to the empty string. * * @param input * some text outside / between tags where it is allowed to * compress spaces. * @return the compressed input. */ private String compressWhitespace(String input) { if (options.getCompressWhitespace()) { Matcher whitespaceMatch = PatternConsts.SPACE_TAB.matcher(input); // keep at least 1 space, as not to change the meaning of the document. return whitespaceMatch.replaceAll(" "); } else { return input; } } /** Named HTML Entities and corresponding numeric character references */ private static final Object[][] ENTITIES = { { "quot", 34 }, { "amp", 38 }, { "lt", 60 }, { "gt", 62 }, // Latin Extended-A { "OElig", 338 }, // latin capital ligature OE, U+0152 ISOlat2 { "oelig", 339 }, // latin small ligature oe, U+0153 ISOlat2 // ligature is a misnomer, this is a separate // character in some languages { "Scaron", 352 }, // latin capital letter S with caron, U+0160 ISOlat2 { "scaron", 353 }, // latin small letter s with caron, U+0161 ISOlat2 { "Yuml", 376 }, // latin capital letter Y with diaeresis, U+0178 ISOlat2 // Spacing Modifier Letters { "circ", 710 }, // modifier letter circumflex accent, U+02C6 ISOpub { "tilde", 732 }, // small tilde, U+02DC ISOdia // General Punctuation { "ensp", 8194 }, // en space, U+2002 ISOpub { "emsp", 8195 }, // em space, U+2003 ISOpub { "thinsp", 8201 }, // thin space, U+2009 ISOpub { "zwnj", 8204 }, // zero width non-joiner, U+200C NEW RFC 2070 { "zwj", 8205 }, // zero width joiner, U+200D NEW RFC 2070 { "lrm", 8206 }, // left-to-right mark, U+200E NEW RFC 2070 { "rlm", 8207 }, // right-to-left mark, U+200F NEW RFC 2070 { "ndash", 8211 }, // en dash, U+2013 ISOpub { "mdash", 8212 }, // em dash, U+2014 ISOpub { "lsquo", 8216 }, // left single quotation mark, U+2018 ISOnum { "rsquo", 8217 }, // right single quotation mark, U+2019 ISOnum { "sbquo", 8218 }, // single low-9 quotation mark, U+201A NEW { "ldquo", 8220 }, // left double quotation mark, U+201C ISOnum { "rdquo", 8221 }, // right double quotation mark, U+201D ISOnum { "bdquo", 8222 }, // double low-9 quotation mark, U+201E NEW { "dagger", 8224 }, // dagger, U+2020 ISOpub { "Dagger", 8225 }, // double dagger, U+2021 ISOpub { "permil", 8240 }, // per mille sign, U+2030 ISOtech { "lsaquo", 8249 }, // single left-pointing angle quotation mark, U+2039 ISO // proposed: lsaquo is proposed but not yet ISO standardized { "rsaquo", 8250 }, // single right-pointing angle quotation mark, U+203A ISO // proposed: rsaquo is proposed but not yet ISO standardized { "euro", 8364 }, // euro sign, U+20AC NEW { "nbsp", 160 }, { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 }, { "yen", 165 }, { "brvbar", 166 }, { "sect", 167 }, { "uml", 168 }, { "copy", 169 }, { "ordf", 170 }, { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 }, { "macr", 175 }, { "deg", 176 }, { "plusmn", 177 }, { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 }, { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 }, { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 }, { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 }, { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 }, { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 }, { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 }, { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 }, { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 }, { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 }, { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 }, { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 }, { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 }, { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 }, { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 }, { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 }, { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 }, { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 }, { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 }, { "fnof", 402 }, { "Alpha", 913 }, { "Beta", 914 }, { "Gamma", 915 }, { "Delta", 916 }, { "Epsilon", 917 }, { "Zeta", 918 }, { "Eta", 919 }, { "Theta", 920 }, { "Iota", 921 }, { "Kappa", 922 }, { "Lambda", 923 }, { "Mu", 924 }, { "Nu", 925 }, { "Xi", 926 }, { "Omicron", 927 }, { "Pi", 928 }, { "Rho", 929 }, { "Sigma", 931 }, { "Tau", 932 }, { "Upsilon", 933 }, { "Phi", 934 }, { "Chi", 935 }, { "Psi", 936 }, { "Omega", 937 }, { "alpha", 945 }, { "beta", 946 }, { "gamma", 947 }, { "delta", 948 }, { "epsilon", 949 }, { "zeta", 950 }, { "eta", 951 }, { "theta", 952 }, { "iota", 953 }, { "kappa", 954 }, { "lambda", 955 }, { "mu", 956 }, { "nu", 957 }, { "xi", 958 }, { "omicron", 959 }, { "pi", 960 }, { "rho", 961 }, { "sigmaf", 962 }, { "sigma", 963 }, { "tau", 964 }, { "upsilon", 965 }, { "phi", 966 }, { "chi", 967 }, { "psi", 968 }, { "omega", 969 }, { "thetasym", 977 }, { "upsih", 978 }, { "piv", 982 }, { "bull", 8226 }, { "hellip", 8230 }, { "prime", 8242 }, { "Prime", 8243 }, { "oline", 8254 }, { "frasl", 8260 }, { "weierp", 8472 }, { "image", 8465 }, { "real", 8476 }, { "trade", 8482 }, { "alefsym", 8501 }, { "larr", 8592 }, { "uarr", 8593 }, { "rarr", 8594 }, { "darr", 8595 }, { "harr", 8596 }, { "crarr", 8629 }, { "lArr", 8656 }, { "uArr", 8657 }, { "rArr", 8658 }, { "dArr", 8659 }, { "hArr", 8660 }, { "forall", 8704 }, { "part", 8706 }, { "exist", 8707 }, { "empty", 8709 }, { "nabla", 8711 }, { "isin", 8712 }, { "notin", 8713 }, { "ni", 8715 }, { "prod", 8719 }, { "sum", 8722 }, { "minus", 8722 }, { "lowast", 8727 }, { "radic", 8730 }, { "prop", 8733 }, { "infin", 8734 }, { "ang", 8736 }, { "and", 8869 }, { "or", 8870 }, { "cap", 8745 }, { "cup", 8746 }, { "int", 8747 }, { "there4", 8756 }, { "sim", 8764 }, { "cong", 8773 }, { "asymp", 8773 }, { "ne", 8800 }, { "equiv", 8801 }, { "le", 8804 }, { "ge", 8805 }, { "sub", 8834 }, { "sup", 8835 }, { "nsub", 8836 }, { "sube", 8838 }, { "supe", 8839 }, { "oplus", 8853 }, { "otimes", 8855 }, { "perp", 8869 }, { "sdot", 8901 }, { "lceil", 8968 }, { "rceil", 8969 }, { "lfloor", 8970 }, { "rfloor", 8971 }, { "lang", 9001 }, { "rang", 9002 }, { "loz", 9674 }, { "spades", 9824 }, { "clubs", 9827 }, { "hearts", 9829 }, { "diams", 9830 } }; /** Converts HTML entities to normal characters */ protected String entitiesToChars(String str) { int strlen = str.length(); StringBuilder res = new StringBuilder(strlen); for (int cp, i = 0; i < strlen; i += Character.charCount(cp)) { cp = str.codePointAt(i); switch (cp) { case '&': int cp1; // if there's one more symbol, reading it, // otherwise it's a dangling '&' if (str.codePointCount(i, strlen) < 2) { res.appendCodePoint(cp); break; } else { cp1 = str.codePointAt(str.offsetByCodePoints(i, 1)); } if (cp1 == '#') { // numeric entity int cp2 = str.codePointAt(str.offsetByCodePoints(i, 2)); if (cp2 == 'x' || cp2 == 'X') { // hex numeric entity int hexStart = str.offsetByCodePoints(i, 3); int hexEnd = hexStart; while (hexEnd < strlen) { int hexCp = str.codePointAt(hexEnd); if (!isHexDigit(hexCp)) { break; } hexEnd += Character.charCount(hexCp); } String sEntity = str.substring(hexStart, hexEnd); try { int nEntity = Integer.parseInt(sEntity, 16); if (nEntity > 0 && nEntity <= 0x10FFFF) { res.appendCodePoint(nEntity); if (hexEnd < strlen && str.codePointAt(hexEnd) == ';') { i = hexEnd; } else { i = str.offsetByCodePoints(hexEnd, -1); } } else { // too big number // dangling '&' res.appendCodePoint(cp); } } catch (NumberFormatException nfe) { // do nothing // dangling '&' res.appendCodePoint(cp); } } else { // decimal entity int decStart = str.offsetByCodePoints(i, 2); int decEnd = decStart; while (decEnd < strlen) { int decCp = str.codePointAt(decEnd); if (!isDecimalDigit(decCp)) { break; } decEnd += Character.charCount(decCp); } String sEntity = str.substring(decStart, decEnd); try { int nEntity = Integer.parseInt(sEntity, 10); if (nEntity > 0 && nEntity <= 0x10FFFF) { res.appendCodePoint(nEntity); if (decEnd < strlen && str.codePointAt(decEnd) == ';') { i = decEnd; } else { i = str.offsetByCodePoints(decEnd, -1); } } else { // too big number // dangling '&' res.appendCodePoint(cp); } } catch (NumberFormatException nfe) { // do nothing // dangling '&' res.appendCodePoint(cp); } } } else if (isLatinLetter(cp1)) { // named entity? int entStart = str.offsetByCodePoints(i, 1); int entEnd = entStart; while (entEnd < strlen) { int entCp = str.codePointAt(entEnd); // Some entities contain numbers, e.g. frac12 if (!isLatinLetter(entCp) && !isDecimalDigit(entCp)) { break; } entEnd += Character.charCount(entCp); } String sEntity = str.substring(entStart, entEnd); int nEntity = lookupEntity(sEntity); if (nEntity > 0 && nEntity <= 65535) { res.append((char) nEntity); if (entEnd < strlen && str.codePointAt(entEnd) == ';') { i = entEnd; } else { i = str.offsetByCodePoints(entEnd, -1); } } else { // too big number // dangling '&' res.appendCodePoint(cp); } } else { // dangling '&' res.appendCodePoint(cp); } break; default: res.appendCodePoint(cp); } } return res.toString(); } /** Returns true if a char is a latin letter */ private boolean isLatinLetter(int ch) { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); } /** Returns true if a char is a decimal digit */ private boolean isDecimalDigit(int ch) { return (ch >= '0' && ch <= '9'); } /** Returns true if a char is a hex digit */ private boolean isHexDigit(int ch) { return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'); } /** * returns a character for HTML entity, or -1 if the passed string is not an * entity */ private int lookupEntity(String entity) { for (int i = 0; i < ENTITIES.length; i++) { Object[] onent = ENTITIES[i]; if (entity.equals(onent[0])) { return ((Integer) onent[1]).intValue(); } } return -1; } /** * Converts characters that must be converted (< > & ' ' * (nbsp)) into HTML entities */ protected String charsToEntities(String str) { int strlen = str.length(); StringBuilder res = new StringBuilder(strlen * 5); for (int cp, i = 0; i < strlen; i += Character.charCount(cp)) { cp = str.codePointAt(i); switch (cp) { case '\u00A0': res.append(" "); break; case '&': res.append("&"); break; case '>': // If it's the end of a processing instruction if ((i > 0) && str.codePointBefore(i) == '?') { res.append(">"); } else { res.append(">"); } break; case '<': int qMarkPos = str.indexOf('?', i); // If it's the beginning of a processing instruction if (qMarkPos == str.offsetByCodePoints(i, 1)) { res.append("<"); break; } int gtpos = str.indexOf('>', i); if (gtpos >= 0) { String maybeShortcut = str.substring(i, str.offsetByCodePoints(gtpos, 1)); boolean foundShortcut = false; // here because it's // impossible to step out of // two loops at once for (String currShortcut : sShortcuts) { if (maybeShortcut.equals(currShortcut)) { // skipping the conversion of < into < // because it's a part of the tag foundShortcut = true; break; } } if (foundShortcut) { res.append(maybeShortcut); i = gtpos; continue; } else { // dangling < res.append("<"); } } else { // dangling < res.append("<"); } break; default: res.appendCodePoint(cp); } } String contents = res.toString(); // Rewrite characters that cannot be encoded to html character strings. // Each character in the contents-string is checked. If a character // can't be encoded, all its occurrences are replaced with the // html-equivalent string. // Then, the next character is checked. // (The loop over the contents-string is restarted for the modified // content, but the starting-position will be the position where the // last unencodable character was found) // [1802000] HTML filter loses html-encoded characters if not supported String encoding = this.filter.getTargetEncoding(); if (encoding != null) { CharsetEncoder charsetEncoder = Charset.forName(encoding).newEncoder(); int i = 0; while (true) { String substring; for (int cp; i < contents.length(); i += substring.length()) { cp = contents.codePointAt(i); substring = contents.substring(i, i + Character.charCount(cp)); if (!charsetEncoder.canEncode(substring)) { String replacement = "&#" + cp + ';'; contents = contents.replaceAll(Pattern.quote(substring), replacement); break; } } if (i == contents.length()) { break; } } } return contents; } }