/** * Copyright © 2002 Instituto Superior Técnico * * This file is part of FenixEdu Academic. * * FenixEdu Academic is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * FenixEdu Academic is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with FenixEdu Academic. If not, see <http://www.gnu.org/licenses/>. */ package org.fenixedu.academic.ui.renderers.htmlEditor; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.tidy.Tidy; import pt.ist.fenixWebFramework.renderers.components.converters.ConversionException; /** * This converter converts an HTML fragment to plain text while preserving some * of the formatting like paragraphs, lists, quotations, smiles, etc. * * @author cfgi */ public class HtmlToTextConverter extends TidyConverter { private static final Logger logger = LoggerFactory.getLogger(HtmlToTextConverter.class); private static final String DEFAULT_INDENT = " "; private final StringBuilder buffer; private int pos; private boolean wrap; private int lineLength; public HtmlToTextConverter() { super(); this.pos = 0; this.buffer = new StringBuilder(); this.wrap = true; this.lineLength = 80; } public int getLineLength() { return this.lineLength; } /** * Sets the line length used when wrapping text. This value is ignored if {@link #isWrap()} returns <code>false</code>. */ public void setLineLength(int lineLength) { this.lineLength = lineLength; } /** * If this converter is wrapping text acording to the line length specified * with {@link #setLineLength(int)}. */ public boolean isWrap() { return this.wrap; } /** * Chooses wether this converter should do line wrapping or not. */ public void setWrap(boolean wrap) { this.wrap = wrap; } @Override protected void parseDocument(OutputStream outStream, Tidy tidy, Document document) { tidy.setPrintBodyOnly(false); parseNode(tidy, document, ""); try { Writer writer = new OutputStreamWriter(outStream, StandardCharsets.UTF_8); writer.write(this.buffer.toString()); writer.flush(); } catch (IOException e) { logger.error(e.getMessage(), e); throw new ConversionException("renderers.converter.text.write"); } } private void parseNode(Tidy tidy, Node node, String indent) { switch (node.getNodeType()) { case Node.DOCUMENT_NODE: parseNodeChildren(tidy, node, indent); break; case Node.ELEMENT_NODE: Element element = (Element) node; String name = element.getNodeName().toLowerCase(); if (name.equals("p")) { ensureBlankLine(); addCodeText(indent); parseNodeChildren(tidy, element, indent); ensureBlankLine(); addCodeText(indent); } else if (name.equals("blockquote")) { ensureBlankLine(); addCodeText(indent + DEFAULT_INDENT); parseNodeChildren(tidy, element, indent + DEFAULT_INDENT); ensureBlankLine(); addCodeText(indent); } else if (name.equals("ul") || name.equals("ol")) { ensureLineBreak(); parseList(tidy, element, name.equals("ol"), indent); ensureLineBreak(); addCodeText(indent); } else if (name.equals("br")) { addLineBreak(); addCodeText(indent); } else if (name.equals("hr")) { ensureLineBreak(); addText("----------", indent); ensureLineBreak(); addCodeText(indent); } else if (name.equals("pre")) { ensureBlankLine(); addCodeText(indent); addCodeText(getChildTextContent(tidy, element)); ensureBlankLine(); addCodeText(indent); } else if (name.equals("code")) { addCodeText(getChildTextContent(tidy, element)); } else if (name.equals("a")) { parseNodeChildren(tidy, element, indent); addText("(" + element.getAttribute("href") + ")", indent); } else if (name.equals("img")) { parseSmile(tidy, element, indent); } else { parseNodeChildren(tidy, node, indent); } break; case Node.TEXT_NODE: addText(getTextContent(tidy, node), indent); break; default: break; } } private void parseList(Tidy tidy, Element element, boolean ordered, String indent) { NodeList itemList = element.getChildNodes(); for (int i = 0; i < itemList.getLength(); i++) { Node item = itemList.item(i); if (item.getNodeType() != Node.ELEMENT_NODE || !item.getNodeName().equalsIgnoreCase("li")) { continue; } addCodeText(indent + DEFAULT_INDENT); addText(ordered ? String.valueOf(i + 1) + ". " : "* ", indent); parseNodeChildren(tidy, item, indent + DEFAULT_INDENT); addLineBreak(); } } private static final Map<String, String> emoticons; static { emoticons = new HashMap<String, String>(); emoticons.put("cool", "B-)"); emoticons.put("cry", ":'-("); emoticons.put("embarassed", ":-$"); emoticons.put("foot-in-mouth", ":-!"); emoticons.put("frown", ":-("); emoticons.put("innocent", "O:-)"); emoticons.put("kiss", ":-*"); emoticons.put("laughing", ":-D"); emoticons.put("money-mouth", ":-$"); emoticons.put("sealed", ":-x"); emoticons.put("suprised", ":-o"); emoticons.put("tongue-out", ":-P"); emoticons.put("undecided", ":-/"); emoticons.put("wink", ";-)"); emoticons.put("yell", ":-O"); } private void parseSmile(Tidy tidy, Element element, String indent) { String source = element.getAttribute("src"); if (source == null) { return; } if (!source.matches(".*?smiley-[^.]+\\.gif")) { // TODO: check this // convention return; } int indexStart = source.lastIndexOf("smiley-") + "smiley-".length(); int indexEnd = source.lastIndexOf("."); String smiley = source.substring(indexStart, indexEnd); String emoticon = emoticons.get(smiley); if (emoticon != null) { addText(emoticon, indent); } } private String getTextContent(Tidy tidy, Node node) { ByteArrayOutputStream outStream = new ByteArrayOutputStream(); tidy.pprint(node, outStream); try { outStream.flush(); } catch (IOException e) { logger.error(e.getMessage(), e); throw new ConversionException("renderers.converter.text.write"); } return new String(outStream.toByteArray(), StandardCharsets.UTF_8); } private String getChildTextContent(Tidy tidy, Node node) { StringBuilder builder = new StringBuilder(); NodeList children = node.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { builder.append(getTextContent(tidy, children.item(i))); } return builder.toString(); } private void parseNodeChildren(Tidy tidy, Node node, String indent) { NodeList children = node.getChildNodes(); for (int i = 0; i < children.getLength(); i++) { parseNode(tidy, children.item(i), indent); } } private void addText(String htmlText, String indent) { if (htmlText == null) { return; } String text = unescapeHtml(htmlText); String[] words = text.split("\\p{Space}+"); for (String word : words) { if (word.length() == 0) { continue; } if (pos + word.length() + 1 > getLineLength()) { buffer.append("\n" + indent); this.buffer.append(word + " "); pos = indent.length() + word.length() + 1; } else { this.buffer.append(word + " "); pos += word.length() + 1; } } } private String unescapeHtml(String htmlText) { String text = htmlText; text = unhtmlentities(text); text = unhtmlAmpersand(text); text = unhtmlAngleBrackets(text); text = unhtmlQuotes(text); return text; } private static String unhtmlQuotes(String str) { str = unhtmlDoubleQuotes(str); //convert double quotes str = unhtmlSingleQuotes(str); //convert single quotes return str; } private static String unhtmlSingleQuotes(String str) { return str.replaceAll("’", "\'"); } private static String unhtmlDoubleQuotes(String str) { return str.replaceAll(""", "\""); } private static String unhtmlAngleBrackets(String str) { str = str.replaceAll("<", "<"); str = str.replaceAll(">", ">"); return str; } private static String unhtmlAmpersand(String str) { return str.replaceAll("&", "&"); } private static String unhtmlentities(String str) { //initialize html translation maps table the first time is called if (unhtmlentities_map.isEmpty()) { initializeEntitiesTables(); } StringBuffer buf = new StringBuffer(); for (int i = 0; i < str.length(); ++i) { char ch = str.charAt(i); if (ch == '&') { int semi = str.indexOf(';', i + 1); if ((semi == -1) || ((semi - i) > 7)) { buf.append(ch); continue; } String entity = str.substring(i, semi + 1); Integer iso; if (entity.charAt(1) == ' ') { buf.append(ch); continue; } if (entity.charAt(1) == '#') { if (entity.charAt(2) == 'x') { iso = Integer.valueOf(Integer.parseInt(entity.substring(3, entity.length() - 1), 16)); } else { iso = Integer.valueOf(entity.substring(2, entity.length() - 1)); } } else { iso = unhtmlentities_map.get(entity); } if (iso == null) { buf.append(entity); } else { buf.append((char) (iso.intValue())); } i = semi; } else { buf.append(ch); } } return buf.toString(); } private static void initializeEntitiesTables() { // initialize html translation maps for (Object[] element : html_entities_table) { unhtmlentities_map.put((String) element[0], (Integer) element[1]); } } private static final Object[][] html_entities_table = { { "Á", 193 }, { "á", 225 }, { "Â", 194 }, { "â", 226 }, { "´", 180 }, { "Æ", 198 }, { "æ", 230 }, { "À", 192 }, { "à", 224 }, { "ℵ", 8501 }, { "Α", 913 }, { "α", 945 }, { "&", 38 }, { "∧", 8743 }, { "∠", 8736 }, { "Å", 197 }, { "å", 229 }, { "≈", 8776 }, { "Ã", 195 }, { "ã", 227 }, { "Ä", 196 }, { "ä", 228 }, { "„", 8222 }, { "Β", 914 }, { "β", 946 }, { "¦", 166 }, { "•", 8226 }, { "∩", 8745 }, { "Ç", 199 }, { "ç", 231 }, { "¸", 184 }, { "¢", 162 }, { "Χ", 935 }, { "χ", 967 }, { "ˆ", 710 }, { "♣", 9827 }, { "≅", 8773 }, { "©", 169 }, { "↵", 8629 }, { "∪", 8746 }, { "¤", 164 }, { "†", 8224 }, { "‡", 8225 }, { "↓", 8595 }, { "⇓", 8659 }, { "°", 176 }, { "Δ", 916 }, { "δ", 948 }, { "♦", 9830 }, { "÷", 247 }, { "É", 201 }, { "é", 233 }, { "Ê", 202 }, { "ê", 234 }, { "È", 200 }, { "è", 232 }, { "∅", 8709 }, { " ", 8195 }, { " ", 8194 }, { "Ε", 917 }, { "ε", 949 }, { "≡", 8801 }, { "Η", 919 }, { "η", 951 }, { "Ð", 208 }, { "ð", 240 }, { "Ë", 203 }, { "ë", 235 }, { "€", 8364 }, { "∃", 8707 }, { "ƒ", 402 }, { "∀", 8704 }, { "½", 189 }, { "¼", 188 }, { "¾", 190 }, { "⁄", 8260 }, { "Γ", 915 }, { "γ", 947 }, { "≥", 8805 }, { "↔", 8596 }, { "⇔", 8660 }, { "♥", 9829 }, { "…", 8230 }, { "Í", 205 }, { "í", 237 }, { "Î", 206 }, { "î", 238 }, { "¡", 161 }, { "Ì", 204 }, { "ì", 236 }, { "ℑ", 8465 }, { "∞", 8734 }, { "∫", 8747 }, { "Ι", 921 }, { "ι", 953 }, { "¿", 191 }, { "∈", 8712 }, { "Ï", 207 }, { "ï", 239 }, { "Κ", 922 }, { "κ", 954 }, { "Λ", 923 }, { "λ", 955 }, { "⟨", 9001 }, { "«", 171 }, { "←", 8592 }, { "⇐", 8656 }, { "⌈", 8968 }, { "“", 8220 }, { "≤", 8804 }, { "⌊", 8970 }, { "∗", 8727 }, { "◊", 9674 }, { "‎", 8206 }, { "‹", 8249 }, { "‘", 8216 }, { "¯", 175 }, { "—", 8212 }, { "µ", 181 }, { "·", 183 }, { "−", 8722 }, { "Μ", 924 }, { "μ", 956 }, { "∇", 8711 }, { " ", 160 }, { "–", 8211 }, { "≠", 8800 }, { "∋", 8715 }, { "¬", 172 }, { "∉", 8713 }, { "⊄", 8836 }, { "Ñ", 209 }, { "ñ", 241 }, { "Ν", 925 }, { "ν", 957 }, { "Ó", 211 }, { "ó", 243 }, { "Ô", 212 }, { "ô", 244 }, { "Œ", 338 }, { "œ", 339 }, { "Ò", 210 }, { "ò", 242 }, { "‾", 8254 }, { "Ω", 937 }, { "ω", 969 }, { "Ο", 927 }, { "ο", 959 }, { "⊕", 8853 }, { "∨", 8744 }, { "ª", 170 }, { "º", 186 }, { "Ø", 216 }, { "ø", 248 }, { "Õ", 213 }, { "õ", 245 }, { "⊗", 8855 }, { "Ö", 214 }, { "ö", 246 }, { "¶", 182 }, { "∂", 8706 }, { "‰", 8240 }, { "⊥", 8869 }, { "Φ", 934 }, { "φ", 966 }, { "Π", 928 }, { "π", 960 }, { "ϖ", 982 }, { "±", 177 }, { "£", 163 }, { "′", 8242 }, { "″", 8243 }, { "∏", 8719 }, { "∝", 8733 }, { "Ψ", 936 }, { "ψ", 968 }, { "√", 8730 }, { "⟩", 9002 }, { "»", 187 }, { "→", 8594 }, { "⇒", 8658 }, { "⌉", 8969 }, { "”", 8221 }, { "ℜ", 8476 }, { "®", 174 }, { "⌋", 8971 }, { "Ρ", 929 }, { "ρ", 961 }, { "‏", 8207 }, { "›", 8250 }, { "’", 8217 }, { "‚", 8218 }, { "Š", 352 }, { "š", 353 }, { "⋅", 8901 }, { "§", 167 }, { "­", 173 }, { "Σ", 931 }, { "σ", 963 }, { "ς", 962 }, { "∼", 8764 }, { "♠", 9824 }, { "⊂", 8834 }, { "⊆", 8838 }, { "∑", 8721 }, { "¹", 185 }, { "²", 178 }, { "³", 179 }, { "⊃", 8835 }, { "⊇", 8839 }, { "ß", 223 }, { "Τ", 932 }, { "τ", 964 }, { "∴", 8756 }, { "Θ", 920 }, { "θ", 952 }, { "ϑ", 977 }, { " ", 8201 }, { "Þ", 222 }, { "þ", 254 }, { "˜", 732 }, { "×", 215 }, { "™", 8482 }, { "Ú", 218 }, { "ú", 250 }, { "↑", 8593 }, { "⇑", 8657 }, { "Û", 219 }, { "û", 251 }, { "Ù", 217 }, { "ù", 249 }, { "¨", 168 }, { "ϒ", 978 }, { "Υ", 933 }, { "υ", 965 }, { "Ü", 220 }, { "ü", 252 }, { "℘", 8472 }, { "Ξ", 926 }, { "ξ", 958 }, { "Ý", 221 }, { "ý", 253 }, { "¥", 165 }, { "ÿ", 255 }, { "Ÿ", 376 }, { "Ζ", 918 }, { "ζ", 950 }, { "‍", 8205 }, { "‌", 8204 } }; private static final Map<String, Integer> unhtmlentities_map = new HashMap<String, Integer>(); private void addCodeText(String htmlText) { if (htmlText == null) { return; } String text = unescapeHtml(htmlText); this.buffer.append(text); pos += text.length() + 1; } private void addLineBreak() { buffer.append("\n"); pos = 0; } private void ensureLineBreak() { if (buffer.length() == 0) { return; } if (buffer.lastIndexOf("\n") == buffer.length() - 1) { return; } addLineBreak(); } private void ensureBlankLine() { if (buffer.length() == 0) { return; } ensureLineBreak(); if (buffer.lastIndexOf("\n\n") == buffer.length() - 2) { return; } addLineBreak(); } }