package org.stfm.texdoclet; import java.awt.Color; import java.awt.Image; import java.awt.Toolkit; import java.awt.image.MemoryImageSource; import java.awt.image.PixelGrabber; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.nio.charset.Charset; import java.text.NumberFormat; import java.text.ParseException; import java.util.Hashtable; import java.util.Scanner; import java.util.Stack; import javax.swing.ImageIcon; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.parser.ParserDelegator; import com.github.rjeschke.txtmark.Processor; import com.keypoint.PngEncoder; /** * <p> * This class implements a <CODE>ParserCallback</CODE> that translates HTML to * the corresponding <TEX txt="\LaTeX{}">LaTeX</TEX>. Not all tags a processed * but the most common are. * <p> * HTML links to files located in the doc-files directory (<a * href="doc-files/appendix_a.html">appendix_a.html</a>, <a * href="doc-files/appendix_b.txt">appendix_b.txt</a>) are transformed to * references to the appendix, whereby the referenced files itself are included * in the appendix. * * @see javax.swing.text.html.parser.ParserDelegator * @author Soeren Caspersen */ public class HTMLtoLaTeXBackEnd extends HTMLEditorKit.ParserCallback { private static final String MARKDOWN1 = "md"; private static final String MARKDOWN2 = "markdown"; private static final String IMAGES_DIR = "texdoclet_images"; /** * Buffer containing the translated HTML. */ StringBuffer ret; Stack<TableInfo> tblstk = new Stack<TableInfo>(); TableInfo tblinfo; int verbat = 0; int colIdx = 0; Hashtable<String, String> colors = new Hashtable<String, String>(10); String block = ""; String refurl = null; String doPrintURL = null; String refname = null; String refimg = null; boolean notex = false; int imageindex = 0; boolean inPreMarkdown = false; /** * Constructs a new instance. * * @param stringBuffer * The <CODE>StringBuffer</CODE> where the translated HTML is * appended. */ public HTMLtoLaTeXBackEnd(StringBuffer stringBuffer) { this.ret = stringBuffer; } /** * This method handles simple HTML tags (e.g. <CODE><HR></CODE>-tags). * It is called by the parser whenever such a tag is encountered. */ @Override public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet attrSet, int pos) { if (tag.toString().equalsIgnoreCase("tex")) { if (attrSet.containsAttribute(HTML.Attribute.ENDTAG, "true")) { notex = false; } else { String tex = (String) attrSet.getAttribute("txt"); ret.append(tex); notex = true; } } else if (notex) { return; } else if (tag == HTML.Tag.META) { } else if (tag == HTML.Tag.HR) { String sz = (String) attrSet.getAttribute(HTML.Attribute.SIZE); int size = 1; if (sz != null) { size = Integer.parseInt(sz); } ret.append("\\mbox{}\\newline\\rule[2mm]{\\hsize}{" + (1 * size * .5) + "mm}\\newline\n"); } else if (tag == HTML.Tag.BR) { ret.append("\\mbox{}\\newline "); } else if (tag == HTML.Tag.IMG) { String refimg = (String) attrSet.getAttribute(HTML.Attribute.SRC); if (refimg.indexOf("://") != -1) { // if (refimg.indexOf("http://") == 0) { // make link ret.append("(see image at " + fixText("<a href=\"" + refimg + "\">" + refimg + "</a>") + ")"); // } else { // skip it // } } else { new File(IMAGES_DIR).mkdir(); double scale = 1.0; File imgF = new File(TeXDoclet.packageDir, refimg); if (!imgF.exists()) { ret.append("(image file not found)"); return; } String imgfile = new File(TeXDoclet.packageDir, refimg) .getAbsolutePath(); ImageIcon icn = new ImageIcon(imgfile); int width = icn.getIconWidth(); int height = icn.getIconHeight(); String sw = (String) attrSet.getAttribute(HTML.Attribute.WIDTH); String sh = (String) attrSet .getAttribute(HTML.Attribute.HEIGHT); try { if (sw != null) { scale = NumberFormat.getPercentInstance().parse(sw) .doubleValue(); } else if (sh != null) { scale = NumberFormat.getPercentInstance().parse(sh) .doubleValue(); } } catch (ParseException er) { er.printStackTrace(); } Image img = icn.getImage(); PixelGrabber pg = new PixelGrabber(img, 0, 0, width, height, true); try { pg.grabPixels(); } catch (InterruptedException e) { throw new RuntimeException( "interrupted waiting for pixels!"); } int[] pixels = (int[]) pg.getPixels(); img = Toolkit.getDefaultToolkit().createImage( new MemoryImageSource(width, height, pixels, 0, width)); byte[] pngbytes; PngEncoder png = new PngEncoder(img, true); String filnavn = IMAGES_DIR + "/pngimage" + imageindex++ + ".png"; try { FileOutputStream outfile = new FileOutputStream(filnavn); pngbytes = png.pngEncode(); if (pngbytes != null) { outfile.write(pngbytes); } outfile.flush(); outfile.close(); } catch (IOException e) { e.printStackTrace(); } if (width * scale <= 800) { width *= scale * 0.5; height *= scale * 0.5; } else { scale = width * scale / 800; width *= 1.0 / scale * 0.5; height *= 1.0 / scale * 0.5; } String fs = System.getProperty("file.separator"); String filnavnFinal = (TeXDoclet.imagesPath == null ? "" : TeXDoclet.imagesPath + (TeXDoclet.imagesPath.endsWith(fs) ? "" : fs)) + filnavn; ret.append("\\mbox{\\includegraphics[width=" + width + "pt, height=" + height + "pt]{" + filnavnFinal + "}}"); } } } /** * This method handles HTML tags that mark a beginning (e.g. * <CODE><P></CODE>-tags). It is called by the parser whenever such a * tag is encountered. */ @Override public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrSet, int pos) { if (notex) { return; } else if (tag == HTML.Tag.PRE) { if (attrSet.containsAttribute("format", MARKDOWN1) || attrSet.containsAttribute("format", MARKDOWN2)) { inPreMarkdown = true; } else { ret.append(TeXDoclet.TRUETYPE + "\\small\n\\mbox{}\\newline "); verbat++; } } else if (tag == HTML.Tag.H1) { ret.append("\\chapter*{"); } else if (tag == HTML.Tag.H2) { ret.append("\\section*{"); } else if (tag == HTML.Tag.H3) { ret.append("\\subsection*{"); } else if (tag == HTML.Tag.H4) { ret.append("\\subsubsection*{"); } else if (tag == HTML.Tag.H5) { ret.append("\\subsubsection*{"); } else if (tag == HTML.Tag.H6) { ret.append("\\subsubsection*{"); } else if (tag == HTML.Tag.SUB) { ret.append("$_{"); } else if (tag == HTML.Tag.SUP) { ret.append("$^{"); // } else if (tag == HTML.Tag.HTML) { } else if (tag == HTML.Tag.HEAD) { } else if (tag == HTML.Tag.CENTER) { ret.append("\\makebox[\\hsize]{ "); } else if (tag == HTML.Tag.TITLE) { ret.append("\\chapter{"); } else if (tag == HTML.Tag.FORM) { } else if (tag == HTML.Tag.INPUT) { } else if (tag == HTML.Tag.BODY) { } else if (tag == HTML.Tag.CODE) { ret.append(TeXDoclet.TRUETYPE + "\\small "); } else if (tag == HTML.Tag.TT) { ret.append(TeXDoclet.TRUETYPE + " "); } else if (tag == HTML.Tag.P) { ret.append("\n\n"); } else if (tag == HTML.Tag.B) { ret.append("{\\bf "); } else if (tag == HTML.Tag.STRONG) { ret.append("{\\bf "); } else if (tag == HTML.Tag.A) { refurl = (String) attrSet.getAttribute(HTML.Attribute.HREF); doPrintURL = (String) attrSet.getAttribute("doprinturl"); if (refurl != null) { if (TeXDoclet.hyperref) { if (refurl.toLowerCase().startsWith("doc-files")) { File file = new File(TeXDoclet.packageDir, refurl); if (file.exists()) { if (TeXDoclet.appendencies.contains(file.getPath())) { refurl = TeXDoclet.appendencies.get(file .getPath()); } else { refurl = "appendix" + new Integer( TeXDoclet.appendencies.size() + 1); TeXDoclet.appendencies.put(file.getPath(), refurl); } ret.append("{"); return; } } else { String sharp = ""; if (refurl.indexOf("#") >= 0) { sharp = refurl.substring(refurl.indexOf("#") + 1, refurl.length()); if (sharp.indexOf("%") >= 0) { sharp = ""; // Don't know what to do with '%' } refurl = refurl.substring(0, refurl.indexOf("#")); } ret.append("\\hyperref{" + refurl + "}{" + sharp + "}{}{"); // ret.append("\\href{" + refurl + "}{"); } } else { ret.append("{\\bf "); } } else { refname = (String) attrSet.getAttribute(HTML.Attribute.NAME); if (refname != null && TeXDoclet.hyperref) { ret.append("\\hyperdef{" + refname + "}{"); } } } else if (tag == HTML.Tag.OL) { ret.append("\n\\begin{enumerate}"); } else if (tag == HTML.Tag.DL) { ret.append("\n\\begin{itemize}"); } else if (tag == HTML.Tag.LI) { ret.append("\n\\item{\\vskip -.8ex "); } else if (tag == HTML.Tag.DT) { ret.append("\\item["); } else if (tag == HTML.Tag.DD) { ret.append("{"); } else if (tag == HTML.Tag.UL) { ret.append("\\begin{itemize}"); } else if (tag == HTML.Tag.I) { ret.append(TeXDoclet.ITALIC + " "); } else if (tag == HTML.Tag.EM) { ret.append(TeXDoclet.ITALIC + " "); } else if (tag == HTML.Tag.TABLE) { tblstk.push(tblinfo); tblinfo = new TableInfo(); ret = tblinfo.startTable(ret, attrSet); } else if (tag == HTML.Tag.TH) { tblinfo.startHeadCol(attrSet); } else if (tag == HTML.Tag.TD) { tblinfo.startCol(attrSet); } else if (tag == HTML.Tag.TR) { tblinfo.startRow(attrSet); } else if (tag == HTML.Tag.FONT) { // String sz = (String) attrSet.getAttribute(HTML.Attribute.SIZE); String col = (String) attrSet.getAttribute(HTML.Attribute.COLOR); ret.append("{"); if (col != null) { if ("redgreenbluewhiteyellowblackcyanmagenta".indexOf(col) != -1) { ret.append("\\color{" + col + "}"); } else { if ("abcdefABCDEF0123456789".indexOf(col.charAt(0)) != -1) { Color cc = new Color((int) Long.parseLong(col, 16)); String name = colors.get("color" + cc.getRGB()); if (name == null) { ret.append("\\definecolor{color" + colIdx + "}[rgb]{" + (cc.getRed() / 255.0) + "," + (cc.getBlue() / 255.0) + "," + (cc.getGreen() / 255.0) + "}"); name = "color" + colIdx; colIdx++; colors.put("color" + cc.getRGB(), name); } ret.append("\\color{" + name + "}"); ++colIdx; } } } } } /** * This method handles HTML tags that mark an ending (e.g. * <CODE></P></CODE>-tags). It is called by the parser whenever such a * tag is encountered. */ @Override public void handleEndTag(HTML.Tag tag, int pos) { if (notex) { return; } else if (tag == HTML.Tag.PRE) { if (!inPreMarkdown) { verbat--; ret.append("}\n"); } else { inPreMarkdown = false; } } else if (tag == HTML.Tag.H1) { ret.append("}"); } else if (tag == HTML.Tag.H2) { ret.append("}"); } else if (tag == HTML.Tag.H3) { ret.append("}"); } else if (tag == HTML.Tag.H4) { ret.append("}"); } else if (tag == HTML.Tag.H5) { ret.append("}"); } else if (tag == HTML.Tag.H6) { ret.append("}"); } else if (tag == HTML.Tag.SUB) { ret.append("}$"); } else if (tag == HTML.Tag.SUP) { ret.append("}$"); // } else if (tag == HTML.Tag.HTML) { } else if (tag == HTML.Tag.HEAD) { } else if (tag == HTML.Tag.CENTER) { ret.append("}"); } else if (tag == HTML.Tag.TITLE) { ret.append("}{"); } else if (tag == HTML.Tag.FORM) { } else if (tag == HTML.Tag.INPUT) { } else if (tag == HTML.Tag.BODY) { } else if (tag == HTML.Tag.CODE) { ret.append("}"); } else if (tag == HTML.Tag.TT) { ret.append("}"); } else if (tag == HTML.Tag.P) { } else if (tag == HTML.Tag.B) { ret.append("}"); } else if (tag == HTML.Tag.STRONG) { ret.append("}"); } else if (tag == HTML.Tag.A) { if (refurl != null) { if (refurl.startsWith("appendix")) { ret.append("\\refdefined{" + refurl + "}"); ret.append("}"); return; } ret.append("}"); if (doPrintURL != null) { if (!refurl.equals("")) { ret.append("(at "); ret.append(fixText(refurl)); ret.append(")"); } } } else if (refname != null) { ret.append("}"); } } else if (tag == HTML.Tag.LI) { ret.append("}"); } else if (tag == HTML.Tag.DT) { ret.append("]"); } else if (tag == HTML.Tag.DD) { ret.append("}"); } else if (tag == HTML.Tag.DL) {// / ret.append("\n\\end{itemize}\n"); } else if (tag == HTML.Tag.OL) { ret.append("\n\\end{enumerate}\n"); } else if (tag == HTML.Tag.UL) { ret.append("\n\\end{itemize}\n"); } else if (tag == HTML.Tag.I) { ret.append("}"); } else if (tag == HTML.Tag.EM) { ret.append("}"); } else if (tag == HTML.Tag.TABLE) { ret = tblinfo.endTable(); tblinfo = tblstk.pop(); } else if (tag == HTML.Tag.TH) { tblinfo.endCol(); } else if (tag == HTML.Tag.TD) { tblinfo.endCol(); } else if (tag == HTML.Tag.TR) { tblinfo.endRow(); } else if (tag == HTML.Tag.FONT) { ret.append("}"); } } /** * This method handles all other text. */ @Override public void handleText(char[] data, int pos) { String str = new String(data); if (inPreMarkdown) { String html = ""; // usually java documentation has a leading space character in each // line // that is to remove for markdown processing ! if (str.startsWith(" ")) { str = removeLeadingSpaces(str); } // test some Markdown processors here : // 1. MarkdownJ // MarkdownProcessor m = new MarkdownProcessor(); // html = m.markdown(str); // 2. PegDown // PegDownProcessor pp = new PegDownProcessor(); // html = pp.markdownToHtml(str); // 3. MarkdownPapers // Markdown md = new Markdown(); // StringWriter sw = new StringWriter(); // try { // md.transform(new StringReader(str), sw); // } catch (org.tautua.markdownpapers.parser.ParseException e) { // e.printStackTrace(); // } // html = sw.toString(); // 4. Txtmark html = Processor.process(str); String toAppend = HTMLtoLaTeXBackEnd.fixText(html); ret.append(toAppend); return; } for (int i = 0; i < str.length(); ++i) { int c = str.charAt(i); if (notex) { continue; } switch (c) { case 160: //   ret.append("\\phantom{ }"); break; case ' ': if (verbat > 0) { ret.append("\\phantom{ }"); } else { ret.append(' '); } break; case '[': if (i < str.length() - 1 && str.charAt(i + 1) == ' ') { ret.append("\\lbrack\\ "); i++; } else { ret.append("\\lbrack "); } break; case ']': if (i < str.length() - 1 && str.charAt(i + 1) == ' ') { ret.append("\\rbrack\\ "); i++; } else { ret.append("\\rbrack "); } break; case '_': case '%': case '$': case '#': case '}': case '{': case '&': ret.append('\\'); ret.append((char) c); if (i < str.length() - 1 && str.charAt(i + 1) == ' ') { ret.append("\\ "); i++; } break; // case 0xc38a: case 0xc3a6: if (Charset.defaultCharset().name().equals("UTF-8")) { if (i < str.length() - 1 && str.charAt(i + 1) == ' ') { ret.append("\\ae\\ "); i++; } else { ret.append("\\ae "); } } else { ret.append((char) c); } break; case 0xc386: if (Charset.defaultCharset().name().equals("UTF-8")) { if (i < str.length() - 1 && str.charAt(i + 1) == ' ') { ret.append("\\AE\\ "); i++; } else { ret.append("\\AE "); } } else { ret.append((char) c); } break; // case 0xc382: case 0xc3a5: if (Charset.defaultCharset().name().equals("UTF-8")) { if (i < str.length() - 1 && str.charAt(i + 1) == ' ') { ret.append("\\aa\\ "); i++; } else { ret.append("\\aa "); } } else { ret.append((char) c); } break; case 0xc385: if (Charset.defaultCharset().name().equals("UTF-8")) { if (i < str.length() - 1 && str.charAt(i + 1) == ' ') { ret.append("\\AA\\ "); i++; } else { ret.append("\\AA "); } } else { ret.append((char) c); } break; // case 0xc2af: case 0xc3b8: if (Charset.defaultCharset().name().equals("UTF-8")) { if (i < str.length() - 1 && str.charAt(i + 1) == ' ') { ret.append("\\o\\ "); i++; } else { ret.append("\\o "); } } else { ret.append((char) c); } break; // case 0xc3bf: case 0xc398: if (Charset.defaultCharset().name().equals("UTF-8")) { if (i < str.length() - 1 && str.charAt(i + 1) == ' ') { ret.append("\\O\\ "); i++; } else { ret.append("\\O "); } } else { ret.append((char) c); } break; case '^': ret.append("$\\wedge$"); break; case '<': ret.append("\\textless "); break; case '\r': case '\n': if (tblstk.size() > 0) { // Swallow new lines while tables are in progress, // <tr> controls new line emission. if (verbat > 0) { ret.append("}\\mbox{}\\newline\n" + TeXDoclet.TRUETYPE + "\\small "); } else { ret.append(" "); } } else { if (verbat > 0) { ret.append("}\\mbox{}\\newline\n" + TeXDoclet.TRUETYPE + "\\small "); } else if ((i + 1) < str.length() && str.charAt(i + 1) == 10) { ret.append("\\bl "); ++i; } else { ret.append((char) c); } } break; case '/': ret.append("/"); break; case '>': ret.append("\\textgreater "); break; case '\\': ret.append("\\textbackslash "); break; default: ret.append((char) c); break; } } } /** * Converts a HTML string into <TEX txt="\LaTeX{}">LaTeX</TEX> using an * instance of <CODE>HTMLtoLaTeXBackEnd</CODE>. * * @return The converted string. */ public static String fixText(String str) { // System.out.println("fixText: " + str); StringBuffer result = new StringBuffer(str.length()); HTMLtoLaTeXBackEnd b = new HTMLtoLaTeXBackEnd(result); Reader reader = new StringReader(str); try { new ParserDelegator().parse(reader, b, false); } catch (IOException e) { } return new String(result); } private String removeLeadingSpaces(String str) { StringBuffer sb = new StringBuffer(); Scanner scanner = new Scanner(str); while (scanner.hasNextLine()) { String l = scanner.nextLine(); if (l.startsWith(" ")) { sb.append(l.substring(1) + System.getProperty("line.separator")); } else { sb.append(l + System.getProperty("line.separator")); } } return sb.toString(); } }