/*
* (c) Copyright 2010-2011 AgileBirds
*
* This file is part of OpenFlexo.
*
* OpenFlexo is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenFlexo is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenFlexo. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.openflexo.toolbox;
import java.awt.Color;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.ParsePosition;
import java.util.Iterator;
import java.util.Locale;
import java.util.regex.Pattern;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.filter.ElementFilter;
import org.jdom2.input.SAXBuilder;
public class HTMLUtils {
public static final String LATEX_BACKSLASH = "\\textbackslash";
private static final String START_HTML_TAG = "<HTML>";
private static final String END_HTML_TAG = "</HTML>";
private static final String START_HEAD_TAG = "<HEAD>";
private static final String END_HEAD_TAG = "</HEAD>";
private static final String START_BODY_TAG = "<BODY>";
private static final String END_BODY_TAG = "</BODY>";
private static final String START_BOLD_TAG = "<B>";
private static final String END_BOLD_TAG = "</B>";
private static final String START_ITALIC_TAG = "<I>";
private static final String END_ITALIC_TAG = "</I>";
private static final String START_UNDERLINE_TAG = "<U>";
private static final String END_UNDERLINE_TAG = "</U>";
private static final String START_UNORDERED_TAG = "<UL>";
private static final String END_UNORDERED_TAG = "</UL>";
private static final String START_ORDERED_TAG = "<OL>";
private static final String END_ORDERED_TAG = "</OL>";
private static final String START_LIST_ITEM_TAG = "<LI>";
private static final String END_LIST_ITEM_TAG = "</LI>";
private static final String BREAK = "<BR>";
private static final String XHTML_BREAK = "<BR/>";
private static final String START_PARAGRAPH_TAG = "<P>";
private static final String END_PARAGRAPH_TAG = "</P>";
private static final String SMALLER = "<";
private static final String GREATER = ">";
private static final String AMPERSAND = "&";
private static final String QUOTE = """;
private static final String a_GRAVE = "à";
private static final String A_GRAVE = "À";
private static final String a_CIRC = "â";
private static final String A_CIRC = "Â";
private static final String a_UML = "ä";
private static final String A_UML = "Ä";
private static final String a_RING = "å";
private static final String A_RING = "Å";
private static final String ae_LIGATURE = "æ";
private static final String AE_LIGATURE = "Æ";
private static final String c_CEDILLA = "ç";
private static final String C_CEDILLA = "Ç";
private static final String e_ACUTE = "é";
private static final String E_ACUTE = "É";
private static final String e_GRAVE = "è";
private static final String E_GRAVE = "È";
private static final String e_CIRC = "ê";
private static final String E_CIRC = "Ê";
private static final String e_UML = "ë";
private static final String E_UML = "Ë";
private static final String i_UML = "ï";
private static final String I_UML = "Ï";
private static final String o_CIRC = "ô";
private static final String O_CIRC = "Ô";
private static final String o_UML = "ö";
private static final String O_UML = "Ö";
private static final String u_GRAVE = "ù";
private static final String U_GRAVE = "Ù";
private static final String u_CIRC = "û";
private static final String U_CIRC = "Û";
private static final String u_UML = "ü";
private static final String U_UML = "Ü";
private static final String REGISTERED = "®";
private static final String COPYRIGHT = "©";
private static final String EURO = "€";
private static final String NON_BREAKING_SPACE = " ";
private static final String FOOTNOTE_TAG = "footnote";
private static final String EMPTY_PARAGRAPH_REGEXP = "\\s*" + START_PARAGRAPH_TAG + "\\s*" + END_PARAGRAPH_TAG + "\\s*";
private static final Pattern EMPTY_PARAGRAPH_PATTERN = Pattern.compile(EMPTY_PARAGRAPH_REGEXP, Pattern.CASE_INSENSITIVE);
/**
 * Converts an HTML document or fragment to LaTeX.
 * <p>
 * The input is parsed with the Swing {@link ParserDelegator}. Each parser event appends the matching LaTeX to a buffer
 * (text runs go through {@link #convertManuallyHTML2Latex(String)}), and the buffer is finally passed to
 * {@code LatexUtils.fixQuotesAndLineReturns}.
 *
 * @param htmlString
 *            the HTML to convert, may be <code>null</code>
 * @return the LaTeX generated for <code>htmlString</code>, or <code>null</code> when <code>htmlString</code> is <code>null</code>
 */
public static String convertHTML2Latex(final String htmlString) {
	if (htmlString == null) {
		return null;
	}
	final StringBuilder sb = new StringBuilder();
	HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {
		private boolean withinHead = false;
		private boolean withinA = false;
		// NOTE(review): nothing in this callback sets this flag to true (the assignment in the <P> branch is
		// disabled), so the "\par" insertion in handleText is currently inactive.
		private boolean lastTagWasPar = false;

		@Override
		public void handleText(char[] data, int pos) {
			String convertedData = convertManuallyHTML2Latex(new String(data));
			if (lastTagWasPar) {
				if (convertedData.trim().length() > 0) {
					sb.append("\\par ");
				}
				lastTagWasPar = false;
			}
			sb.append(convertedData);
		}

		@Override
		public void handleComment(char[] data, int pos) {
			// HTML comments produce no LaTeX output.
		}

		@Override
		public void handleEndTag(Tag t, int pos) {
			if (t == HTML.Tag.HEAD) {
				withinHead = false;
				return;
			}
			if (t == HTML.Tag.A && withinA) {
				// Only anchors that opened a \hyperlink{..}{ or \href{..}{ group have a brace to close.
				withinA = false;
				sb.append("}");
			}
			if (t == HTML.Tag.B || t == HTML.Tag.I || t == HTML.Tag.U || t == HTML.Tag.FONT) {
				sb.append("}");
			} else if (t == HTML.Tag.UL) {
				sb.append("\\myitemsep\\end{itemize}\n");
			} else if (t == HTML.Tag.OL) {
				sb.append("\\myitemsep\\end{enumerate}\n");
			} else if (t != HTML.Tag.P && t != HTML.Tag.LI) {
				// </P> and </LI> need no LaTeX counterpart; everything else may be a custom tag.
				handleUnknownEndTag(t, pos);
			}
		}

		@Override
		public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
			// The parser reports tags it treats as "simple" for both opening and closing occurrences; look at the
			// source text to tell them apart. The index is guarded so that an unexpected position cannot throw.
			int next = pos + 1;
			boolean closing = next >= 0 && next < htmlString.length() && htmlString.charAt(next) == '/';
			if (closing) {
				handleEndTag(t, pos);
			} else {
				handleStartTag(t, a, pos);
			}
		}

		@Override
		public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
			if (t == HTML.Tag.HTML || t == HTML.Tag.BODY) {
				return;
			}
			if (t == HTML.Tag.HEAD) {
				withinHead = true;
				return;
			}
			if (t != HTML.Tag.P) {
				lastTagWasPar = false;
			}
			if (t == HTML.Tag.B) {
				sb.append("\\textbf{");
			} else if (t == HTML.Tag.I) {
				sb.append("\\textit{");
			} else if (t == HTML.Tag.A) {
				appendAnchor(a);
			} else if (t == HTML.Tag.U) {
				sb.append("\\underline{");
			} else if (t == HTML.Tag.P) {
				sb.append("\n\n");
			} else if (t == HTML.Tag.BR) {
				sb.append("\\par ").append(StringUtils.LINE_SEPARATOR);
			} else if (t == HTML.Tag.UL) {
				sb.append("\\doitemsep\\begin{itemize}").append(StringUtils.LINE_SEPARATOR);
			} else if (t == HTML.Tag.OL) {
				sb.append("\\doitemsep\\begin{enumerate}").append(StringUtils.LINE_SEPARATOR);
			} else if (t == HTML.Tag.LI) {
				sb.append("\\item ");
			} else if (t == HTML.Tag.FONT) {
				sb.append(latexFontGroup((String) a.getAttribute(HTML.Attribute.SIZE)));
			} else if (t == HTML.Tag.IMG) {
				appendImage(a);
			} else {
				handleUnknownStartTag(t, pos);
			}
		}

		/** Emits \hyperlink / \href for anchors with an href, or \hypertarget for named anchors. */
		private void appendAnchor(MutableAttributeSet a) {
			if (a.getAttribute(HTML.Attribute.HREF) != null) {
				String href = (String) a.getAttribute(HTML.Attribute.HREF);
				if (href.startsWith("#")) {
					sb.append("\\hyperlink{" + href.substring(1) + "}{");
				} else {
					sb.append("\\href{" + href + "}{");
				}
				withinA = true;
			} else if (a.getAttribute(HTML.Attribute.NAME) != null) {
				String name = (String) a.getAttribute(HTML.Attribute.NAME);
				sb.append("\\hypertarget{" + name + "}{}");
			}
		}

		/** Returns the opening of the LaTeX group for a FONT tag; always opens a brace because </FONT> closes one. */
		private String latexFontGroup(String size) {
			if (size != null) {
				try {
					int sizeInt = Integer.parseInt(size);
					switch (sizeInt) {
					case 1:
						return "{\\footnotesize ";
					case 2:
						return "{\\small ";
					case 3:
						return "{\\normalsize ";
					case 4:
						return "{\\large ";
					case 5:
						return "{\\Large ";
					case 6:
						return "{\\LARGE ";
					case 7:
						return "{\\huge ";
					default:
						if (sizeInt > 7) {
							return "{\\Huge ";
						}
						break;
					}
				} catch (NumberFormatException e) {
					// Not a plain integer size: fall back to an unsized group.
				}
			}
			return "{";
		}

		/** Emits \includegraphics for an IMG tag found outside the head section. */
		private void appendImage(MutableAttributeSet a) {
			if (withinHead) {
				return;
			}
			String src = (String) a.getAttribute(HTML.Attribute.SRC);
			if (src == null) {
				System.err.println("WARNING:\tI could not find 'src' attribute");
				return;
			}
			String width = (String) a.getAttribute(HTML.Attribute.WIDTH);
			String height = (String) a.getAttribute(HTML.Attribute.HEIGHT);
			sb.append("\\includegraphics");
			if (height != null || width != null) {
				sb.append("[");
				if (width != null) {
					appendDimension("width=", width);
				}
				if (height != null) {
					if (width != null) {
						sb.append(',');
					}
					appendDimension("height=", height);
				}
				sb.append("]");
			}
			sb.append("{Figures/").append(src).append("}");
		}

		/**
		 * Appends one graphics option. Percentages are written as a fraction (50% gives 0.5, 100% gives 1); the
		 * previous integer division turned every value below 100% into 0. Other values are written in points.
		 * NOTE(review): the fraction is emitted without a unit, as before - confirm what the LaTeX template expects.
		 */
		private void appendDimension(String option, String value) {
			sb.append(option);
			int percentIndex = value.indexOf("%");
			if (percentIndex > -1) {
				try {
					int percent = Integer.parseInt(value.substring(0, percentIndex).trim());
					if (percent % 100 == 0) {
						sb.append(percent / 100);
					} else {
						sb.append(percent / 100.0);
					}
				} catch (NumberFormatException e) {
					e.printStackTrace();
					sb.append("1");
				}
			} else {
				sb.append(value.trim()).append("pt");
			}
		}

		private void handleUnknownStartTag(Tag t, int pos) {
			if (FOOTNOTE_TAG.equals(t.toString())) {
				sb.append("\\footnote{");
			}
		}

		private void handleUnknownEndTag(Tag t, int pos) {
			if (FOOTNOTE_TAG.equals(t.toString())) {
				sb.append("}");
			}
		}
	};
	Reader reader = new StringReader(htmlString);
	try {
		new ParserDelegator().parse(reader, callback, false);
	} catch (IOException e1) {
		e1.printStackTrace();
	}
	return LatexUtils.fixQuotesAndLineReturns(sb.toString());
}
/** Tags recognised by the manual converter, each paired with the LaTeX emitted in its place (possibly nothing). */
private static final String[][] MANUAL_TAG_TO_LATEX = { { START_HTML_TAG, "" }, { END_HTML_TAG, "" }, { START_BODY_TAG, "" },
		{ END_BODY_TAG, "" }, { START_BOLD_TAG, "\\textbf{" }, { END_BOLD_TAG, "}" }, { START_ITALIC_TAG, "\\textit{" },
		{ END_ITALIC_TAG, "}" }, { START_UNDERLINE_TAG, "\\underline{" }, { END_UNDERLINE_TAG, "}" },
		{ START_UNORDERED_TAG, "\\begin{itemize}" }, { END_UNORDERED_TAG, "\\end{itemize}" },
		{ START_ORDERED_TAG, "\\begin{enumerate}" }, { END_ORDERED_TAG, "\\end{enumerate}" }, { START_LIST_ITEM_TAG, "\\item " },
		{ END_LIST_ITEM_TAG, "" }, { BREAK, "\\\\" }, { XHTML_BREAK, "\\\\" }, { START_PARAGRAPH_TAG, "\\par " },
		{ END_PARAGRAPH_TAG, "" } };

/** Entity references recognised by the manual converter, each paired with the LaTeX emitted in its place. */
private static final String[][] MANUAL_ENTITY_TO_LATEX = { { SMALLER, "\\textless{}" }, { GREATER, "\\textgreater{}" },
		{ AMPERSAND, "\\&" }, { QUOTE, "\"" }, { a_GRAVE, "\u00e0" }, { A_GRAVE, "\u00c0" }, { a_CIRC, "\u00e2" },
		{ A_CIRC, "\u00c2" }, { a_UML, "\u00e4" }, { A_UML, "\u00c4" }, { a_RING, "\u00e5" }, { A_RING, "\u00c5" },
		{ ae_LIGATURE, "\u00e6" }, { AE_LIGATURE, "\u00c6" }, { c_CEDILLA, "\u00e7" }, { C_CEDILLA, "\u00c7" },
		{ e_ACUTE, "\u00e9" }, { E_ACUTE, "\u00c9" }, { e_GRAVE, "\u00e8" }, { E_GRAVE, "\u00c8" }, { e_CIRC, "\u00ea" },
		{ E_CIRC, "\u00ca" }, { e_UML, "\u00eb" }, { E_UML, "\u00cb" }, { i_UML, "\u00ef" }, { I_UML, "\u00cf" },
		{ o_CIRC, "\u00f4" }, { O_CIRC, "\u00d4" }, { o_UML, "\u00f6" }, { O_UML, "\u00d6" }, { u_GRAVE, "\u00f9" },
		{ U_GRAVE, "\u00d9" }, { u_CIRC, "\u00fb" }, { U_CIRC, "\u00db" }, { u_UML, "\u00fc" }, { U_UML, "\u00dc" },
		{ REGISTERED, "\\textregistered{}" }, { COPYRIGHT, "\\copyright{}" }, { EURO, "\u20ac" }, { NON_BREAKING_SPACE, "~" } };

/**
 * Converts HTML to LaTeX with a hand-written scanner (no HTML parser involved).
 * <p>
 * LaTeX special characters are escaped, a fixed set of tags and character entity references is translated, IMG tags
 * become <code>\includegraphics</code>, and everything between the head tags is dropped. A <code>&lt;</code> that does not
 * start a recognised tag is rendered as a less-than sign.
 *
 * @param htmlString
 *            the HTML or text to convert, may be <code>null</code>
 * @return the converted text, or <code>null</code> when <code>htmlString</code> is <code>null</code>
 */
protected static String convertManuallyHTML2Latex(String htmlString) {
	if (htmlString == null) {
		return null;
	}
	final int length = htmlString.length();
	final StringBuilder sb = new StringBuilder(length);
	boolean withinHead = false;
	// Every branch leaves i on the LAST character it consumed; the loop increment then moves to the next one.
	for (int i = 0; i < length; i++) {
		if (withinHead) {
			// Head content is never rendered: only look for the closing tag.
			if (htmlString.regionMatches(true, i, END_HEAD_TAG, 0, END_HEAD_TAG.length())) {
				withinHead = false;
				i += END_HEAD_TAG.length() - 1;
			}
			continue;
		}
		final char c = htmlString.charAt(i);
		switch (c) {
		case '~':
			sb.append("$\\sim$");
			break;
		case '{':
		case '}':
		case '$':
		case '%':
		case '#':
			sb.append("\\").append(c);
			break;
		case '\\':
			sb.append(LATEX_BACKSLASH).append("{}");
			break;
		case '<':
			if (htmlString.regionMatches(true, i, START_HEAD_TAG, 0, START_HEAD_TAG.length())) {
				withinHead = true;
				i += START_HEAD_TAG.length() - 1;
			} else if (htmlString.regionMatches(true, i, END_HEAD_TAG, 0, END_HEAD_TAG.length())) {
				// Stray closing tag without a matching opening one.
				i += END_HEAD_TAG.length() - 1;
			} else if (isManualImageTag(htmlString, i)) {
				int close = htmlString.indexOf('>', i);
				int end = close > -1 ? close + 1 : length;
				appendManualImage(htmlString.substring(i, end), sb);
				i = end - 1;
			} else {
				String[] tag = findManualReplacement(MANUAL_TAG_TO_LATEX, htmlString, i, true);
				if (tag != null) {
					sb.append(tag[1]);
					i += tag[0].length() - 1;
				} else {
					// Not a tag we know: keep the character instead of silently dropping it.
					sb.append("\\textless{}");
				}
			}
			break;
		case '&':
			// Entity names are case-sensitive (&Agrave; vs &agrave;), so try an exact match first and only then
			// fall back to a case-insensitive one for oddly-cased input such as &LT;.
			String[] entity = findManualReplacement(MANUAL_ENTITY_TO_LATEX, htmlString, i, false);
			if (entity == null) {
				entity = findManualReplacement(MANUAL_ENTITY_TO_LATEX, htmlString, i, true);
			}
			if (entity != null) {
				sb.append(entity[1]);
				i += entity[0].length() - 1;
				break;
			}
			int semicolon = htmlString.indexOf(';', i + 1);
			int codePoint = parseManualNumericEntity(htmlString, i, semicolon);
			if (codePoint >= 0) {
				// NOTE(review): the decoded character is appended as is, i.e. without LaTeX escaping.
				sb.appendCodePoint(codePoint);
				i = semicolon;
			} else {
				sb.append("\\&");
			}
			break;
		default:
			sb.append(c);
			break;
		}
	}
	return sb.toString();
}

/** Returns the first row of <code>table</code> whose key occurs in <code>text</code> at <code>offset</code>, or null. */
private static String[] findManualReplacement(String[][] table, String text, int offset, boolean ignoreCase) {
	for (String[] row : table) {
		if (text.regionMatches(ignoreCase, offset, row[0], 0, row[0].length())) {
			return row;
		}
	}
	return null;
}

/** Tells whether an IMG tag (any case) starts at <code>offset</code>. */
private static boolean isManualImageTag(String text, int offset) {
	if (!text.regionMatches(true, offset, "<IMG", 0, 4)) {
		return false;
	}
	int after = offset + 4;
	return after >= text.length() || !Character.isLetterOrDigit(text.charAt(after));
}

/**
 * Decodes the numeric character reference (decimal or hexadecimal) spanning <code>ampersand..semicolon</code>.
 *
 * @return the code point, or -1 when the span is not a well-formed numeric reference
 */
private static int parseManualNumericEntity(String text, int ampersand, int semicolon) {
	// Need at least "&#d;".
	if (semicolon < ampersand + 3 || text.charAt(ampersand + 1) != '#') {
		return -1;
	}
	String digits = text.substring(ampersand + 2, semicolon);
	int radix = 10;
	if (digits.charAt(0) == 'x' || digits.charAt(0) == 'X') {
		radix = 16;
		digits = digits.substring(1);
	}
	if (digits.length() == 0) {
		return -1;
	}
	for (int k = 0; k < digits.length(); k++) {
		// Rejects signs, whitespace and any other text that happens to sit between "&#" and the next ';'.
		if (Character.digit(digits.charAt(k), radix) < 0) {
			return -1;
		}
	}
	try {
		int codePoint = Integer.parseInt(digits, radix);
		return Character.isValidCodePoint(codePoint) ? codePoint : -1;
	} catch (NumberFormatException e) {
		// Too many digits to fit an int.
		return -1;
	}
}

/** Appends the \includegraphics command for a complete IMG tag, or logs a warning when it has no source. */
private static void appendManualImage(String img, StringBuilder sb) {
	String src = extractImageSource(img);
	if (src == null) {
		System.err.println("WARNING:\tI could not find 'src' attribute within: " + img);
		return;
	}
	String width = extractImageWidth(img);
	String height = extractImageHeight(img);
	sb.append("\\includegraphics");
	if (height != null || width != null) {
		sb.append("[");
		if (width != null) {
			appendManualImageDimension(sb, "width=", width);
		}
		if (height != null) {
			if (width != null) {
				sb.append(',');
			}
			appendManualImageDimension(sb, "height=", height);
		}
		sb.append("]");
	}
	sb.append("{Figures/").append(src).append("}");
}

/**
 * Appends one graphics option. Percentages are written as a fraction (50% gives 0.5, 100% gives 1) instead of being
 * truncated by an integer division; other values are written in points.
 * NOTE(review): the fraction is emitted without a unit, as before - confirm what the LaTeX template expects.
 */
private static void appendManualImageDimension(StringBuilder sb, String option, String value) {
	sb.append(option);
	int percentIndex = value.indexOf("%");
	if (percentIndex > -1) {
		try {
			int percent = Integer.parseInt(value.substring(0, percentIndex).trim());
			if (percent % 100 == 0) {
				sb.append(percent / 100);
			} else {
				sb.append(percent / 100.0);
			}
		} catch (NumberFormatException e) {
			e.printStackTrace();
			sb.append("1");
		}
	} else {
		sb.append(value.trim()).append("pt");
	}
}
/** Returns the value of the HEIGHT attribute of the given IMG tag, or <code>null</code> if it has none. */
private static String extractImageHeight(String img) {
	return extractAttributeNamed(img, "HEIGHT");
}

/** Returns the value of the WIDTH attribute of the given IMG tag, or <code>null</code> if it has none. */
private static String extractImageWidth(String img) {
	return extractAttributeNamed(img, "WIDTH");
}

/** Returns the value of the SRC attribute of the given IMG tag, or <code>null</code> if it has none. */
private static String extractImageSource(String img) {
	return extractAttributeNamed(img, "SRC");
}

/**
 * Extracts the value of an attribute from the text of a single tag.
 * <p>
 * The attribute name is matched case-insensitively, only outside quoted values, only as a whole name (so
 * <code>SRC</code> does not match <code>srcset</code>) and only when followed by <code>=</code>. The value may be enclosed
 * in double or single quotes, or be unquoted, in which case it ends at the first whitespace, at <code>&gt;</code> or at
 * the end of the text.
 *
 * @param tag
 *            the text of the tag, e.g. <code>&lt;img src="a.png"&gt;</code>
 * @param attribute
 *            the attribute name to look for
 * @return the attribute value without its quotes, or <code>null</code> if the attribute is absent, has no value or its
 *         quoted value is not terminated
 */
private static String extractAttributeNamed(String tag, String attribute) {
	final int length = tag.length();
	char openQuote = 0; // quote character of the value being skipped, 0 when outside any quoted value
	boolean escaped = false; // true when the previous character was an unescaped backslash
	for (int i = 0; i < length; i++) {
		final char c = tag.charAt(i);
		if (escaped) {
			// A backslash protects exactly one character, whatever it is.
			escaped = false;
			continue;
		}
		if (c == '\\') {
			escaped = true;
			continue;
		}
		if (openQuote != 0) {
			if (c == openQuote) {
				openQuote = 0;
			}
			continue;
		}
		if (c == '"' || c == '\'') {
			openQuote = c;
			continue;
		}
		if (!tag.regionMatches(true, i, attribute, 0, attribute.length())) {
			continue;
		}
		int j = i + attribute.length();
		boolean wholeName = (i == 0 || !isAttributeNameChar(tag.charAt(i - 1))) && (j >= length || !isAttributeNameChar(tag.charAt(j)));
		if (!wholeName) {
			continue;
		}
		while (j < length && Character.isWhitespace(tag.charAt(j))) {
			j++;
		}
		if (j >= length || tag.charAt(j) != '=') {
			// Same letters but not an assignment: keep scanning.
			continue;
		}
		j++;
		while (j < length && Character.isWhitespace(tag.charAt(j))) {
			j++;
		}
		if (j >= length) {
			return null;
		}
		char first = tag.charAt(j);
		if (first == '"' || first == '\'') {
			int closing = tag.indexOf(first, j + 1);
			return closing < 0 ? null : tag.substring(j + 1, closing);
		}
		int end = j;
		while (end < length && tag.charAt(end) != '>' && !Character.isWhitespace(tag.charAt(end))) {
			end++;
		}
		return tag.substring(j, end);
	}
	return null;
}

/** Tells whether <code>c</code> can be part of an attribute name (used to detect whole-name matches). */
private static boolean isAttributeNameChar(char c) {
	return Character.isLetterOrDigit(c) || c == '-' || c == '_' || c == ':' || c == '.';
}
/**
 * Returns what lies between the opening and closing body tags of the given HTML.
 *
 * @param html
 *            the HTML to inspect, may be <code>null</code>
 * @return the body content, or <code>null</code> if <code>html</code> is <code>null</code> or has no usable body
 * @see #extractBodyContent(String, boolean)
 */
public static String extractBodyContent(String html) {
	return extractBodyContent(html, false);
}

/**
 * Returns what lies between the opening and closing body tags of the given HTML.
 * <p>
 * Tags are matched case-insensitively and the opening tag may carry attributes. One whitespace character (or one CR+LF
 * pair) directly after the opening tag is skipped. If there is no closing tag, everything after the opening tag is
 * returned.
 *
 * @param html
 *            the HTML to inspect, may be <code>null</code>
 * @param returnHtmlIfNoBodyFound
 *            when <code>true</code>, <code>html</code> itself is returned if no opening body tag is found or nothing follows
 *            it; when <code>false</code>, <code>null</code> is returned in that case
 * @return the body content, the fallback described above, or <code>null</code> if <code>html</code> is <code>null</code>
 */
public static String extractBodyContent(String html, boolean returnHtmlIfNoBodyFound) {
	if (html == null) {
		return null;
	}
	final String fallback = returnHtmlIfNoBodyFound ? html : null;
	final String openTagStart = "<body";
	final String closeTag = "</body>";
	final int length = html.length();

	// Locate the end of the opening tag. The search works on the original string (no upper-cased copy) because
	// changing case can change the length of a string and would shift every index.
	int contentStart = -1;
	int searchFrom = 0;
	while (contentStart < 0) {
		int open = indexOfIgnoreCase(html, openTagStart, searchFrom);
		if (open < 0) {
			return fallback;
		}
		int afterName = open + openTagStart.length();
		if (afterName >= length) {
			return fallback;
		}
		char next = html.charAt(afterName);
		if (next == '>' || Character.isWhitespace(next)) {
			int close = html.indexOf('>', afterName);
			if (close < 0) {
				return fallback;
			}
			contentStart = close + 1;
		} else {
			// Some other tag whose name merely starts with "body".
			searchFrom = afterName;
		}
	}
	if (contentStart >= length) {
		return fallback;
	}
	// Skip the line break (or single blank) that usually follows the opening tag, but never a content character.
	if (html.startsWith("\r\n", contentStart)) {
		contentStart += 2;
	} else if (Character.isWhitespace(html.charAt(contentStart))) {
		contentStart++;
	}
	int contentEnd = indexOfIgnoreCase(html, closeTag, contentStart);
	return contentEnd < 0 ? html.substring(contentStart) : html.substring(contentStart, contentEnd);
}

/** Case-insensitive <code>indexOf</code> that never builds a case-converted copy of <code>text</code>. */
private static int indexOfIgnoreCase(String text, String needle, int from) {
	int last = text.length() - needle.length();
	for (int i = Math.max(from, 0); i <= last; i++) {
		if (text.regionMatches(true, i, needle, 0, needle.length())) {
			return i;
		}
	}
	return -1;
}
public static String escapeStringForHTML(String s, boolean removeNewLine) {
if (s == null) {
return null;
}
StringBuffer sb = new StringBuffer();
int n = s.length();
for (int i = 0; i < n; i++) {
char c = s.charAt(i);
switch (c) {
case '<':
sb.append("<");
break;
case '>':
sb.append(">");
break;
case '&':
sb.append("&");
break;
case '"':
sb.append(""");
break;
case '\'':
sb.append("");
break;
case '\n':
if (!removeNewLine) {
sb.append("<br/>");
} else {
sb.append(' ');
}
break;
case '\r':
break;
case 'à':
sb.append("à");
break;
case 'À':
sb.append("À");
break;
case 'â':
sb.append("â");
break;
case 'Â':
sb.append("Â");
break;
case 'ä':
sb.append("ä");
break;
case 'Ä':
sb.append("Ä");
break;
case 'å':
sb.append("å");
break;
case 'Å':
sb.append("Å");
break;
case 'æ':
sb.append("æ");
break;
case 'Æ':
sb.append("Æ");
break;
case 'ç':
sb.append("ç");
break;
case 'Ç':
sb.append("Ç");
break;
case 'é':
sb.append("é");
break;
case 'É':
sb.append("É");
break;
case 'è':
sb.append("è");
break;
case 'È':
sb.append("È");
break;
case 'ê':
sb.append("ê");
break;
case 'Ê':
sb.append("Ê");
break;
case 'ë':
sb.append("ë");
break;
case 'Ë':
sb.append("Ë");
break;
case 'ï':
sb.append("ï");
break;
case 'Ï':
sb.append("Ï");
break;
case 'ô':
sb.append("ô");
break;
case 'Ô':
sb.append("Ô");
break;
case 'ö':
sb.append("ö");
break;
case 'Ö':
sb.append("Ö");
break;
case 'ø':
sb.append("ø");
break;
case 'Ø':
sb.append("Ø");
break;
case 'ß':
sb.append("ß");
break;
case 'ù':
sb.append("ù");
break;
case 'Ù':
sb.append("Ù");
break;
case 'û':
sb.append("û");
break;
case 'Û':
sb.append("Û");
break;
case 'ü':
sb.append("ü");
break;
case 'Ü':
sb.append("Ü");
break;
case '®':
sb.append("®");
break;
case '©':
sb.append("©");
break;
case '€':
sb.append("€");
break;
default:
sb.append(c);
break;
}
}
return sb.toString();
}
/**
 * Strips the markup from an HTML string and returns the remaining text.
 * <p>
 * Everything from a <code>&lt;</code> up to the next <code>&gt;</code> is discarded; text outside tags is kept. Line feeds
 * are always emitted, either unchanged or as a space.
 *
 * @param s
 *            the HTML text, may be <code>null</code>
 * @param removeNewLine
 *            <code>true</code> to replace each line feed by a space, <code>false</code> to keep it
 * @return the text without tags, or <code>null</code> if <code>s</code> is <code>null</code>
 */
public static String convertHTMLToPlainText(String s, boolean removeNewLine) {
	if (s == null) {
		return null;
	}
	final char lineFeedReplacement = removeNewLine ? ' ' : '\n';
	final StringBuilder plain = new StringBuilder();
	boolean insideTag = false;
	for (char current : s.toCharArray()) {
		if (current == '<') {
			insideTag = true;
		} else if (current == '>') {
			insideTag = false;
		} else if (current == '\n') {
			// Line feeds are emitted wherever they occur, including inside a tag.
			plain.append(lineFeedReplacement);
		} else if (!insideTag) {
			plain.append(current);
		}
	}
	return plain.toString();
}
/**
 * Tells whether the given HTML consists of a single paragraph element containing only whitespace, optionally surrounded
 * by whitespace (case-insensitive).
 *
 * @param html
 *            the HTML to test, may be <code>null</code>
 * @return <code>true</code> if <code>html</code> matches the empty-paragraph pattern, <code>false</code> otherwise or when
 *         <code>html</code> is <code>null</code>
 */
public static boolean isEmtpyParagraph(String html) {
	if (html == null) {
		return false;
	}
	return EMPTY_PARAGRAPH_PATTERN.matcher(html).matches();
}
/**
 * Prints the hexadecimal representation of a few sample colours to the standard error stream.
 *
 * @param args
 *            ignored
 */
public static void main(String[] args) {
	final Color[] samples = { Color.WHITE, Color.BLACK, new Color(1, 2, 3) };
	for (Color sample : samples) {
		System.err.println(toHexString(sample));
	}
}
/**
 * Named HTML colours, each associated with its hexadecimal value written as <code>#rrggbb</code>.
 */
public enum HTMLColors {
	indianred("#cd5c5c"), lightcoral("#f08080"), salmon("#fa8072"), darksalmon("#e9967a"), lightsalmon("#ffa07a"),
	crimson("#dc143c"), red("#ff0000"), firebrick("#b22222"), darkred("#8b0000"), pink("#ffc0cb"),
	lightpink("#ffb6c1"), hotpink("#ff69b4"), deeppink("#ff1493"), mediumvioletred("#c71585"), palevioletred("#db7093"),
	coral("#ff7f50"), tomato("#ff6347"), orangered("#ff4500"), darkorange("#ff8c00"), orange("#ffa500"),
	gold("#ffd700"), yellow("#ffff00"), lightyellow("#ffffe0"), lemonchiffon("#fffacd"), lightgoldenrodyellow("#fafad2"),
	papayawhip("#ffefd5"), moccasin("#ffe4b5"), peachpuff("#ffdab9"), palegoldenrod("#eee8aa"), khaki("#f0e68c"),
	darkkhaki("#bdb76b"), lavender("#e6e6fa"), thistle("#d8bfd8"), plum("#dda0dd"), violet("#ee82ee"),
	orchid("#da70d6"), fuchsia("#ff00ff"), magenta("#ff00ff"), mediumorchid("#ba55d3"), mediumpurple("#9370db"),
	amethyst("#9966cc"), blueviolet("#8a2be2"), darkviolet("#9400d3"), darkorchid("#9932cc"), darkmagenta("#8b008b"),
	purple("#800080"), indigo("#4b0082"), slateblue("#6a5acd"), darkslateblue("#483d8b"), mediumslateblue("#7b68ee"),
	greenyellow("#adff2f"), chartreuse("#7fff00"), lawngreen("#7cfc00"), lime("#00ff00"), limegreen("#32cd32"),
	palegreen("#98fb98"), lightgreen("#90ee90"), mediumspringgreen("#00fa9a"), springgreen("#00ff7f"), mediumseagreen("#3cb371"),
	seagreen("#2e8b57"), forestgreen("#228b22"), green("#008000"), darkgreen("#006400"), yellowgreen("#9acd32"),
	olivedrab("#6b8e23"), olive("#808000"), darkolivegreen("#556b2f"), mediumaquamarine("#66cdaa"), darkseagreen("#8fbc8f"),
	lightseagreen("#20b2aa"), darkcyan("#008b8b"), teal("#008080"), aqua("#00ffff"), cyan("#00ffff"),
	lightcyan("#e0ffff"), paleturquoise("#afeeee"), aquamarine("#7fffd4"), turquoise("#40e0d0"), mediumturquoise("#48d1cc"),
	darkturquoise("#00ced1"), cadetblue("#5f9ea0"), steelblue("#4682b4"), lightsteelblue("#b0c4de"), powderblue("#b0e0e6"),
	lightblue("#add8e6"), skyblue("#87ceeb"), lightskyblue("#87cefa"), deepskyblue("#00bfff"), dodgerblue("#1e90ff"),
	cornflowerblue("#6495ed"), royalblue("#4169e1"), blue("#0000ff"), mediumblue("#0000cd"), darkblue("#00008b"),
	navy("#000080"), midnightblue("#191970"), cornsilk("#fff8dc"), blanchedalmond("#ffebcd"), bisque("#ffe4c4"),
	navajowhite("#ffdead"), wheat("#f5deb3"), burlywood("#deb887"), tan("#d2b48c"), rosybrown("#bc8f8f"),
	sandybrown("#f4a460"), goldenrod("#daa520"), darkgoldenrod("#b8860b"), peru("#cd853f"), chocolate("#d2691e"),
	saddlebrown("#8b4513"), sienna("#a0522d"), brown("#a52a2a"), maroon("#800000"), white("#ffffff"),
	snow("#fffafa"), honeydew("#f0fff0"), mintcream("#f5fffa"), azure("#f0ffff"), aliceblue("#f0f8ff"),
	ghostwhite("#f8f8ff"), whitesmoke("#f5f5f5"), seashell("#fff5ee"), beige("#f5f5dc"), oldlace("#fdf5e6"),
	floralwhite("#fffaf0"), ivory("#fffff0"), antiquewhite("#faebd7"), linen("#faf0e6"), lavenderblush("#fff0f5"),
	mistyrose("#ffe4e1"), gainsboro("#dcdcdc"), lightgrey("#d3d3d3"), silver("#c0c0c0"), darkgray("#a9a9a9"),
	gray("#808080"), dimgray("#696969"), lightslategray("#778899"), slategray("#708090"), darkslategray("#2f4f4f"),
	black("#000000");

	/** Hexadecimal value of the colour, including the leading '#'. */
	private final String hexValue;

	private HTMLColors(String hexValue) {
		this.hexValue = hexValue;
	}

	/**
	 * @return the hexadecimal value of this colour, including the leading '#'
	 */
	public String getHexValue() {
		return hexValue;
	}

	/**
	 * @return the AWT colour decoded from the six hexadecimal digits of this constant
	 */
	public Color getColor() {
		final String digits = getHexValue().substring(1);
		return extractColorFromHexValue(digits);
	}
}
/**
 * Parses a colour written the HTML/CSS way.
 * <p>
 * Supported forms are <code>#rgb</code> / <code>#rrggbb</code>, the same hexadecimal digits without the leading
 * <code>#</code>, <code>rgb(r, g, b)</code> with integer components, <code>rgb(r%, g%, b%)</code> with percentages, and
 * the colour names declared in {@link HTMLColors} (in any case).
 *
 * @param color
 *            the colour specification, may be <code>null</code>
 * @return the parsed colour, or <code>null</code> if <code>color</code> is <code>null</code> or cannot be interpreted (a
 *         message is then written to the standard error stream)
 */
public static Color extractColorFromString(String color) {
	if (color == null) {
		return null;
	}
	final String spec = color.trim();
	try {
		if (spec.startsWith("#")) {
			Color fromHex = extractColorFromHexValue(spec.substring(1));
			if (fromHex != null) {
				return fromHex;
			}
		} else if (spec.regionMatches(true, 0, "rgb(", 0, 4) && spec.indexOf(')') > -1) {
			String components = spec.substring(4, spec.indexOf(')'));
			String[] rgb = components.split(",");
			if (rgb.length == 3) {
				if (components.indexOf('%') > -1) {
					// The '%' sign must be removed before parsing, otherwise Float parsing always fails.
					return new Color(parsePercentage(rgb[0]), parsePercentage(rgb[1]), parsePercentage(rgb[2]));
				}
				// Integer.parseInt does not accept surrounding blanks.
				return new Color(Integer.parseInt(rgb[0].trim()), Integer.parseInt(rgb[1].trim()), Integer.parseInt(rgb[2].trim()));
			}
		} else {
			Color fromHex = extractColorFromHexValue(spec);
			if (fromHex != null) {
				return fromHex;
			}
			try {
				// Locale.ROOT keeps the lookup independent of the default locale (e.g. Turkish dotless i).
				return HTMLColors.valueOf(spec.toLowerCase(Locale.ROOT)).getColor();
			} catch (IllegalArgumentException e) {
				// Not an HTML colour name: reported below.
			}
		}
	} catch (RuntimeException e) {
		// Malformed numbers or components outside the range accepted by Color.
		e.printStackTrace();
	}
	System.err.println("String color '" + spec + "' is not a valid string color");
	return null;
}

/** Converts one <code>rgb()</code> percentage component such as <code>" 50% "</code> to a fraction (0.5). */
private static float parsePercentage(String component) {
	return Float.parseFloat(component.replace("%", "").trim()) / 100;
}
/**
 * Decodes a colour from three or six hexadecimal digits (without a leading '#'). In the three-digit form every digit is
 * doubled, so <code>f0a</code> is read as <code>ff00aa</code>.
 *
 * @param color
 *            the hexadecimal digits
 * @return the decoded colour, or <code>null</code> if <code>color</code> is not made of exactly three or six hexadecimal
 *         digits
 */
private static Color extractColorFromHexValue(String color) {
	String digits = color;
	if (digits.length() == 3) {
		StringBuilder doubled = new StringBuilder(6);
		for (int k = 0; k < 3; k++) {
			char digit = digits.charAt(k);
			doubled.append(digit).append(digit);
		}
		digits = doubled.toString();
	}
	if (digits.length() != 6 || !digits.matches("[0-9A-Fa-f]+")) {
		return null;
	}
	final int rgb = Integer.parseInt(digits, 16);
	return new Color((rgb >> 16) & 0xFF, (rgb >> 8) & 0xFF, rgb & 0xFF);
}
/**
 * Formats a colour as six upper-case hexadecimal digits (red, green, blue), without a leading '#'.
 *
 * @param color
 *            the colour to format
 * @return the <code>RRGGBB</code> representation of <code>color</code>
 */
public static String toHexString(Color color) {
	final int red = color.getRed();
	final int green = color.getGreen();
	final int blue = color.getBlue();
	return String.format("%1$02X%2$02X%3$02X", red, green, blue);
}
/**
 * Extracts the media URL from the HTML snippet used to embed a video (an <code>object</code> and/or <code>embed</code>
 * element).
 * <p>
 * Three strategies are tried in turn: an XML parse looking for <code>param name="movie"</code> inside an
 * <code>object</code> element or for the <code>src</code> of an <code>embed</code> element; the Swing HTML parser combined
 * with a textual search for the movie parameter; and finally a plain textual search for the <code>src</code> of an
 * <code>embed</code> tag.
 *
 * @param htmlCode
 *            the embedding code, may be <code>null</code>
 * @return the URL found, or <code>null</code> if <code>htmlCode</code> is <code>null</code>, shorter than 7 characters or
 *         contains no recognisable source
 */
public static String extractSourceFromEmbeddedTag(String htmlCode) {
	if (htmlCode == null || htmlCode.length() < 7) {
		return null;
	}
	if (!htmlCode.regionMatches(true, 0, "<html>", 0, 6)) {
		htmlCode = "<html>" + htmlCode + "</html>";
	}
	final String embeddedVideoCode = htmlCode;

	// 1. Let's try with XML parsers (it works most of the time and it is a lot more reliable as a parser).
	// A bare '&' (very common in URLs) is not well-formed XML, so every ampersand that does not already start a
	// numeric reference or one of the five predefined XML entities is escaped first.
	// NOTE(review): the builder uses its default configuration; consider disabling DTDs if the input is untrusted.
	String xmlReady = embeddedVideoCode.replaceAll("&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#[xX][0-9A-Fa-f]+);)", "&amp;");
	try {
		SAXBuilder builder = new SAXBuilder();
		Document document = builder.build(new StringReader(xmlReady));
		Iterator<Element> objects = document.getDescendants(new ElementFilter("object"));
		while (objects.hasNext()) {
			for (Element param : objects.next().getChildren("param")) {
				if ("movie".equals(param.getAttributeValue("name"))) {
					return param.getAttributeValue("value");
				}
			}
		}
		Iterator<Element> embeds = document.getDescendants(new ElementFilter("embed"));
		while (embeds.hasNext()) {
			String src = embeds.next().getAttributeValue("src");
			if (src != null) {
				return src;
			}
		}
	} catch (JDOMException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	}

	// 2. Ok XML parsers failed, let's see HTML ones
	final StringBuilder sb = new StringBuilder();
	HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {
		@Override
		public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
			if (sb.length() > 0 || t != Tag.OBJECT) {
				// Already found, or not the element we are interested in.
				return;
			}
			int indexOfParamMovie = embeddedVideoCode.indexOf("<param name=\"movie\"", pos);
			if (indexOfParamMovie < 0) {
				return;
			}
			int indexOfMovieValue = embeddedVideoCode.indexOf("value=\"", indexOfParamMovie);
			if (indexOfMovieValue < 0) {
				return;
			}
			// 7 is the length of value=" : the value starts right after it.
			int endIndexOfMovieValue = embeddedVideoCode.indexOf('"', indexOfMovieValue + 7);
			if (endIndexOfMovieValue > -1) {
				sb.append(embeddedVideoCode.substring(indexOfMovieValue + 7, endIndexOfMovieValue));
			}
		}
	};
	Reader reader = new StringReader(embeddedVideoCode);
	try {
		new ParserDelegator().parse(reader, callback, false);
	} catch (IOException e1) {
		e1.printStackTrace();
	}
	if (sb.length() > 0) {
		return sb.toString();
	}

	// 3. Last resort: Manual parsing
	int indexOfEmbed = embeddedVideoCode.indexOf("<embed");
	if (indexOfEmbed > -1) {
		int indexOfSrc = embeddedVideoCode.indexOf("src=\"", indexOfEmbed);
		if (indexOfSrc > -1) {
			// 5 is the length of src=" : the value starts right after it.
			int endIndexOfSrc = embeddedVideoCode.indexOf('"', indexOfSrc + 5);
			if (endIndexOfSrc > -1) {
				return embeddedVideoCode.substring(indexOfSrc + 5, endIndexOfSrc);
			}
		}
	}
	return null;
}
/**
 * Converts a CSS-like font size such as <code>12pt</code>, <code>16px</code> or <code>16</code> to whole points.
 * <p>
 * A value without a unit is interpreted as pixels. Pixels are converted assuming 92 pixels and 72 points per inch,
 * i.e. <code>points = pixels * 72 / 92</code>; the result is truncated to an integer. The number is parsed with '.' as
 * decimal separator whatever the default locale is.
 *
 * @param fontSizeWithUnit
 *            the size to convert, may be <code>null</code>
 * @return the size in points, or <code>null</code> if the value is <code>null</code>, does not start with a number or uses
 *         a unit other than <code>px</code> or <code>pt</code> (for instance <code>%</code> or <code>em</code>)
 */
public static Integer getFontSizeInPoints(String fontSizeWithUnit) {
	if (fontSizeWithUnit == null) {
		return null;
	}
	final String text = fontSizeWithUnit.trim();
	// Locale-independent symbols and no grouping: "1.5" must mean one and a half everywhere.
	DecimalFormat formatter = new DecimalFormat("#.#", DecimalFormatSymbols.getInstance(Locale.ROOT));
	ParsePosition position = new ParsePosition(0);
	Number size = formatter.parse(text, position);
	if (size == null) {
		return null;
	}
	String unit = "px";
	if (position.getIndex() < text.length()) {
		unit = text.substring(position.getIndex()).trim().toLowerCase(Locale.ROOT);
	}
	if ("px".equals(unit)) {
		// The previous factor (92 / 72) was an integer division, hence always 1, and was also inverted:
		// one inch is 72 points or 92 pixels, so a pixel is 72/92 of a point.
		return Integer.valueOf((int) (size.doubleValue() * 72.0 / 92.0));
	}
	if ("pt".equals(unit)) {
		return Integer.valueOf((int) size.doubleValue());
	}
	// Don't handle % or em
	return null;
}
/**
 * Converts the value of an HTML <code>font size</code> attribute, given as text, to points.
 *
 * @param fontSizeString
 *            the attribute value, may be <code>null</code>
 * @return the size in points as computed by {@link #getFontSizeInPointsFromFontValue(int)}, or <code>null</code> if the
 *         value is <code>null</code> or not an integer
 */
public static Integer getFontSizeInPointsFromFontValue(String fontSizeString) {
	if (fontSizeString == null) {
		return null;
	}
	try {
		return getFontSizeInPointsFromFontValue(Integer.parseInt(fontSizeString.trim()));
	} catch (NumberFormatException e) {
		// Ok not a number, lets return null
		return null;
	}
}

/**
 * Converts an HTML font size (1 to 7) to points.
 *
 * @param fontSize
 *            the HTML font size
 * @return 8, 10, 12, 14, 18, 24 or 36 points for sizes 1 to 7; 8 for any size below 1; <code>36 + fontSize</code> for
 *         sizes above 7
 */
public static Integer getFontSizeInPointsFromFontValue(int fontSize) {
	if (fontSize < 1) {
		// Sizes below the smallest HTML size used to fall into the "36 + fontSize" branch and came out larger
		// than size 6; treat them as the smallest size instead.
		return 8;
	}
	switch (fontSize) {
	case 1:
		return 8;
	case 2:
		return 10; // Default
	case 3:
		return 12;
	case 4:
		return 14;
	case 5:
		return 18;
	case 6:
		return 24;
	case 7:
		return 36;
	default:
		return 36 + fontSize;
	}
}
/**
 * Maps a font size in points to the HTML font size (1 to 7) whose range contains it.
 *
 * @param fontSizeInPoints
 *            the size in points
 * @return 1 up to 8 points, 2 up to 11, 3 up to 13, 4 up to 16, 5 up to 21, 6 up to 30, and 7 above
 */
public static int getFontValueFromFontSizeInPoints(int fontSizeInPoints) {
	// Largest point size still mapped to HTML sizes 1, 2, ... 6 (in that order).
	final int[] upperBounds = { 8, 11, 13, 16, 21, 30 };
	int fontValue = 1;
	for (int upperBound : upperBounds) {
		if (fontSizeInPoints <= upperBound) {
			return fontValue;
		}
		fontValue++;
	}
	return fontValue;
}
}