/**
* Copyright © 2002 Instituto Superior Técnico
*
* This file is part of FenixEdu Academic.
*
* FenixEdu Academic is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* FenixEdu Academic is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with FenixEdu Academic. If not, see <http://www.gnu.org/licenses/>.
*/
package org.fenixedu.academic.ui.renderers.htmlEditor;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import pt.ist.fenixWebFramework.renderers.components.converters.ConversionException;
/**
* This converter converts an HTML fragment to plain text while preserving some
* of the formatting like paragraphs, lists, quotations, smiles, etc.
*
* @author cfgi
*/
public class HtmlToTextConverter extends TidyConverter {
private static final Logger logger = LoggerFactory.getLogger(HtmlToTextConverter.class);
private static final String DEFAULT_INDENT = " ";
private final StringBuilder buffer;
private int pos;
private boolean wrap;
private int lineLength;
public HtmlToTextConverter() {
super();
this.pos = 0;
this.buffer = new StringBuilder();
this.wrap = true;
this.lineLength = 80;
}
public int getLineLength() {
return this.lineLength;
}
/**
* Sets the line length used when wrapping text. This value is ignored if {@link #isWrap()} returns <code>false</code>.
*/
public void setLineLength(int lineLength) {
this.lineLength = lineLength;
}
/**
* If this converter is wrapping text acording to the line length specified
* with {@link #setLineLength(int)}.
*/
public boolean isWrap() {
return this.wrap;
}
/**
* Chooses wether this converter should do line wrapping or not.
*/
public void setWrap(boolean wrap) {
this.wrap = wrap;
}
@Override
protected void parseDocument(OutputStream outStream, Tidy tidy, Document document) {
tidy.setPrintBodyOnly(false);
parseNode(tidy, document, "");
try {
Writer writer = new OutputStreamWriter(outStream, StandardCharsets.UTF_8);
writer.write(this.buffer.toString());
writer.flush();
} catch (IOException e) {
logger.error(e.getMessage(), e);
throw new ConversionException("renderers.converter.text.write");
}
}
private void parseNode(Tidy tidy, Node node, String indent) {
switch (node.getNodeType()) {
case Node.DOCUMENT_NODE:
parseNodeChildren(tidy, node, indent);
break;
case Node.ELEMENT_NODE:
Element element = (Element) node;
String name = element.getNodeName().toLowerCase();
if (name.equals("p")) {
ensureBlankLine();
addCodeText(indent);
parseNodeChildren(tidy, element, indent);
ensureBlankLine();
addCodeText(indent);
} else if (name.equals("blockquote")) {
ensureBlankLine();
addCodeText(indent + DEFAULT_INDENT);
parseNodeChildren(tidy, element, indent + DEFAULT_INDENT);
ensureBlankLine();
addCodeText(indent);
} else if (name.equals("ul") || name.equals("ol")) {
ensureLineBreak();
parseList(tidy, element, name.equals("ol"), indent);
ensureLineBreak();
addCodeText(indent);
} else if (name.equals("br")) {
addLineBreak();
addCodeText(indent);
} else if (name.equals("hr")) {
ensureLineBreak();
addText("----------", indent);
ensureLineBreak();
addCodeText(indent);
} else if (name.equals("pre")) {
ensureBlankLine();
addCodeText(indent);
addCodeText(getChildTextContent(tidy, element));
ensureBlankLine();
addCodeText(indent);
} else if (name.equals("code")) {
addCodeText(getChildTextContent(tidy, element));
} else if (name.equals("a")) {
parseNodeChildren(tidy, element, indent);
addText("(" + element.getAttribute("href") + ")", indent);
} else if (name.equals("img")) {
parseSmile(tidy, element, indent);
} else {
parseNodeChildren(tidy, node, indent);
}
break;
case Node.TEXT_NODE:
addText(getTextContent(tidy, node), indent);
break;
default:
break;
}
}
private void parseList(Tidy tidy, Element element, boolean ordered, String indent) {
NodeList itemList = element.getChildNodes();
for (int i = 0; i < itemList.getLength(); i++) {
Node item = itemList.item(i);
if (item.getNodeType() != Node.ELEMENT_NODE || !item.getNodeName().equalsIgnoreCase("li")) {
continue;
}
addCodeText(indent + DEFAULT_INDENT);
addText(ordered ? String.valueOf(i + 1) + ". " : "* ", indent);
parseNodeChildren(tidy, item, indent + DEFAULT_INDENT);
addLineBreak();
}
}
private static final Map<String, String> emoticons;
static {
emoticons = new HashMap<String, String>();
emoticons.put("cool", "B-)");
emoticons.put("cry", ":'-(");
emoticons.put("embarassed", ":-$");
emoticons.put("foot-in-mouth", ":-!");
emoticons.put("frown", ":-(");
emoticons.put("innocent", "O:-)");
emoticons.put("kiss", ":-*");
emoticons.put("laughing", ":-D");
emoticons.put("money-mouth", ":-$");
emoticons.put("sealed", ":-x");
emoticons.put("suprised", ":-o");
emoticons.put("tongue-out", ":-P");
emoticons.put("undecided", ":-/");
emoticons.put("wink", ";-)");
emoticons.put("yell", ":-O");
}
private void parseSmile(Tidy tidy, Element element, String indent) {
String source = element.getAttribute("src");
if (source == null) {
return;
}
if (!source.matches(".*?smiley-[^.]+\\.gif")) { // TODO: check this
// convention
return;
}
int indexStart = source.lastIndexOf("smiley-") + "smiley-".length();
int indexEnd = source.lastIndexOf(".");
String smiley = source.substring(indexStart, indexEnd);
String emoticon = emoticons.get(smiley);
if (emoticon != null) {
addText(emoticon, indent);
}
}
private String getTextContent(Tidy tidy, Node node) {
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
tidy.pprint(node, outStream);
try {
outStream.flush();
} catch (IOException e) {
logger.error(e.getMessage(), e);
throw new ConversionException("renderers.converter.text.write");
}
return new String(outStream.toByteArray(), StandardCharsets.UTF_8);
}
private String getChildTextContent(Tidy tidy, Node node) {
StringBuilder builder = new StringBuilder();
NodeList children = node.getChildNodes();
for (int i = 0; i < children.getLength(); i++) {
builder.append(getTextContent(tidy, children.item(i)));
}
return builder.toString();
}
private void parseNodeChildren(Tidy tidy, Node node, String indent) {
NodeList children = node.getChildNodes();
for (int i = 0; i < children.getLength(); i++) {
parseNode(tidy, children.item(i), indent);
}
}
private void addText(String htmlText, String indent) {
if (htmlText == null) {
return;
}
String text = unescapeHtml(htmlText);
String[] words = text.split("\\p{Space}+");
for (String word : words) {
if (word.length() == 0) {
continue;
}
if (pos + word.length() + 1 > getLineLength()) {
buffer.append("\n" + indent);
this.buffer.append(word + " ");
pos = indent.length() + word.length() + 1;
} else {
this.buffer.append(word + " ");
pos += word.length() + 1;
}
}
}
private String unescapeHtml(String htmlText) {
String text = htmlText;
text = unhtmlentities(text);
text = unhtmlAmpersand(text);
text = unhtmlAngleBrackets(text);
text = unhtmlQuotes(text);
return text;
}
private static String unhtmlQuotes(String str) {
str = unhtmlDoubleQuotes(str); //convert double quotes
str = unhtmlSingleQuotes(str); //convert single quotes
return str;
}
private static String unhtmlSingleQuotes(String str) {
return str.replaceAll("’", "\'");
}
private static String unhtmlDoubleQuotes(String str) {
return str.replaceAll(""", "\"");
}
private static String unhtmlAngleBrackets(String str) {
str = str.replaceAll("<", "<");
str = str.replaceAll(">", ">");
return str;
}
private static String unhtmlAmpersand(String str) {
return str.replaceAll("&", "&");
}
private static String unhtmlentities(String str) {
//initialize html translation maps table the first time is called
if (unhtmlentities_map.isEmpty()) {
initializeEntitiesTables();
}
StringBuffer buf = new StringBuffer();
for (int i = 0; i < str.length(); ++i) {
char ch = str.charAt(i);
if (ch == '&') {
int semi = str.indexOf(';', i + 1);
if ((semi == -1) || ((semi - i) > 7)) {
buf.append(ch);
continue;
}
String entity = str.substring(i, semi + 1);
Integer iso;
if (entity.charAt(1) == ' ') {
buf.append(ch);
continue;
}
if (entity.charAt(1) == '#') {
if (entity.charAt(2) == 'x') {
iso = Integer.valueOf(Integer.parseInt(entity.substring(3, entity.length() - 1), 16));
} else {
iso = Integer.valueOf(entity.substring(2, entity.length() - 1));
}
} else {
iso = unhtmlentities_map.get(entity);
}
if (iso == null) {
buf.append(entity);
} else {
buf.append((char) (iso.intValue()));
}
i = semi;
} else {
buf.append(ch);
}
}
return buf.toString();
}
private static void initializeEntitiesTables() {
// initialize html translation maps
for (Object[] element : html_entities_table) {
unhtmlentities_map.put((String) element[0], (Integer) element[1]);
}
}
private static final Object[][] html_entities_table = { { "Á", 193 }, { "á", 225 }, { "Â", 194 },
{ "â", 226 }, { "´", 180 }, { "Æ", 198 }, { "æ", 230 }, { "À", 192 },
{ "à", 224 }, { "ℵ", 8501 }, { "Α", 913 }, { "α", 945 }, { "&", 38 },
{ "∧", 8743 }, { "∠", 8736 }, { "Å", 197 }, { "å", 229 }, { "≈", 8776 },
{ "Ã", 195 }, { "ã", 227 }, { "Ä", 196 }, { "ä", 228 }, { "„", 8222 },
{ "Β", 914 }, { "β", 946 }, { "¦", 166 }, { "•", 8226 }, { "∩", 8745 },
{ "Ç", 199 }, { "ç", 231 }, { "¸", 184 }, { "¢", 162 }, { "Χ", 935 }, { "χ", 967 },
{ "ˆ", 710 }, { "♣", 9827 }, { "≅", 8773 }, { "©", 169 }, { "↵", 8629 },
{ "∪", 8746 }, { "¤", 164 }, { "†", 8224 }, { "‡", 8225 }, { "↓", 8595 },
{ "⇓", 8659 }, { "°", 176 }, { "Δ", 916 }, { "δ", 948 }, { "♦", 9830 },
{ "÷", 247 }, { "É", 201 }, { "é", 233 }, { "Ê", 202 }, { "ê", 234 },
{ "È", 200 }, { "è", 232 }, { "∅", 8709 }, { " ", 8195 }, { " ", 8194 },
{ "Ε", 917 }, { "ε", 949 }, { "≡", 8801 }, { "Η", 919 }, { "η", 951 },
{ "Ð", 208 }, { "ð", 240 }, { "Ë", 203 }, { "ë", 235 }, { "€", 8364 }, { "∃", 8707 },
{ "ƒ", 402 }, { "∀", 8704 }, { "½", 189 }, { "¼", 188 }, { "¾", 190 },
{ "⁄", 8260 }, { "Γ", 915 }, { "γ", 947 }, { "≥", 8805 }, { "↔", 8596 },
{ "⇔", 8660 }, { "♥", 9829 }, { "…", 8230 }, { "Í", 205 }, { "í", 237 },
{ "Î", 206 }, { "î", 238 }, { "¡", 161 }, { "Ì", 204 }, { "ì", 236 },
{ "ℑ", 8465 }, { "∞", 8734 }, { "∫", 8747 }, { "Ι", 921 }, { "ι", 953 },
{ "¿", 191 }, { "∈", 8712 }, { "Ï", 207 }, { "ï", 239 }, { "Κ", 922 },
{ "κ", 954 }, { "Λ", 923 }, { "λ", 955 }, { "〈", 9001 }, { "«", 171 },
{ "←", 8592 }, { "⇐", 8656 }, { "⌈", 8968 }, { "“", 8220 }, { "≤", 8804 },
{ "⌊", 8970 }, { "∗", 8727 }, { "◊", 9674 }, { "", 8206 }, { "‹", 8249 },
{ "‘", 8216 }, { "¯", 175 }, { "—", 8212 }, { "µ", 181 }, { "·", 183 },
{ "−", 8722 }, { "Μ", 924 }, { "μ", 956 }, { "∇", 8711 }, { " ", 160 }, { "–", 8211 },
{ "≠", 8800 }, { "∋", 8715 }, { "¬", 172 }, { "∉", 8713 }, { "⊄", 8836 }, { "Ñ", 209 },
{ "ñ", 241 }, { "Ν", 925 }, { "ν", 957 }, { "Ó", 211 }, { "ó", 243 }, { "Ô", 212 },
{ "ô", 244 }, { "Œ", 338 }, { "œ", 339 }, { "Ò", 210 }, { "ò", 242 },
{ "‾", 8254 }, { "Ω", 937 }, { "ω", 969 }, { "Ο", 927 }, { "ο", 959 },
{ "⊕", 8853 }, { "∨", 8744 }, { "ª", 170 }, { "º", 186 }, { "Ø", 216 },
{ "ø", 248 }, { "Õ", 213 }, { "õ", 245 }, { "⊗", 8855 }, { "Ö", 214 },
{ "ö", 246 }, { "¶", 182 }, { "∂", 8706 }, { "‰", 8240 }, { "⊥", 8869 }, { "Φ", 934 },
{ "φ", 966 }, { "Π", 928 }, { "π", 960 }, { "ϖ", 982 }, { "±", 177 }, { "£", 163 },
{ "′", 8242 }, { "″", 8243 }, { "∏", 8719 }, { "∝", 8733 }, { "Ψ", 936 }, { "ψ", 968 },
{ "√", 8730 }, { "〉", 9002 }, { "»", 187 }, { "→", 8594 }, { "⇒", 8658 },
{ "⌉", 8969 }, { "”", 8221 }, { "ℜ", 8476 }, { "®", 174 }, { "⌋", 8971 },
{ "Ρ", 929 }, { "ρ", 961 }, { "", 8207 }, { "›", 8250 }, { "’", 8217 },
{ "‚", 8218 }, { "Š", 352 }, { "š", 353 }, { "⋅", 8901 }, { "§", 167 },
{ "", 173 }, { "Σ", 931 }, { "σ", 963 }, { "ς", 962 }, { "∼", 8764 },
{ "♠", 9824 }, { "⊂", 8834 }, { "⊆", 8838 }, { "∑", 8721 }, { "¹", 185 }, { "²", 178 },
{ "³", 179 }, { "⊃", 8835 }, { "⊇", 8839 }, { "ß", 223 }, { "Τ", 932 }, { "τ", 964 },
{ "∴", 8756 }, { "Θ", 920 }, { "θ", 952 }, { "ϑ", 977 }, { " ", 8201 },
{ "Þ", 222 }, { "þ", 254 }, { "˜", 732 }, { "×", 215 }, { "™", 8482 },
{ "Ú", 218 }, { "ú", 250 }, { "↑", 8593 }, { "⇑", 8657 }, { "Û", 219 },
{ "û", 251 }, { "Ù", 217 }, { "ù", 249 }, { "¨", 168 }, { "ϒ", 978 },
{ "Υ", 933 }, { "υ", 965 }, { "Ü", 220 }, { "ü", 252 }, { "℘", 8472 },
{ "Ξ", 926 }, { "ξ", 958 }, { "Ý", 221 }, { "ý", 253 }, { "¥", 165 }, { "ÿ", 255 },
{ "Ÿ", 376 }, { "Ζ", 918 }, { "ζ", 950 }, { "", 8205 }, { "", 8204 } };
private static final Map<String, Integer> unhtmlentities_map = new HashMap<String, Integer>();
private void addCodeText(String htmlText) {
if (htmlText == null) {
return;
}
String text = unescapeHtml(htmlText);
this.buffer.append(text);
pos += text.length() + 1;
}
private void addLineBreak() {
buffer.append("\n");
pos = 0;
}
private void ensureLineBreak() {
if (buffer.length() == 0) {
return;
}
if (buffer.lastIndexOf("\n") == buffer.length() - 1) {
return;
}
addLineBreak();
}
private void ensureBlankLine() {
if (buffer.length() == 0) {
return;
}
ensureLineBreak();
if (buffer.lastIndexOf("\n\n") == buffer.length() - 2) {
return;
}
addLineBreak();
}
}