package com.pnikosis.html2markdown; import com.pnikosis.html2markdown.MDLine.MDLineType; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Calendar; import java.util.List; import java.util.Map; import java.util.TreeMap; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Entities.EscapeMode; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.parser.Tag; import org.jsoup.safety.Cleaner; import org.jsoup.safety.Whitelist; /** * Convert Html to MarkDown */ public class HTML2Md { private static int indentation = -1; private static boolean orderedList = false; public static String convert(String theHTML, String baseURL) { Document doc = Jsoup.parse(theHTML, baseURL); return parseDocument(doc); } public static String convert(URL url, int timeoutMillis) throws IOException { Document doc = Jsoup.parse(url, timeoutMillis); return parseDocument(doc); } public static String convertHtml(String html, String charset) throws IOException { Document doc = Jsoup.parse(html, charset); return parseDocument(doc); } public static String convertFile(File file, String charset) throws IOException { Document doc = Jsoup.parse(file, charset); return parseDocument(doc); } public static void htmlToJekyllMd(String htmlPath, String mdPath, String charset) { try { List<File> fileList = FilesUtil.getAllFiles(htmlPath, "html"); for (File file : fileList) { String mdName = file.getAbsolutePath().replace(htmlPath, mdPath).replace("html", "md"); String hmPath = mdName.substring(0, mdName.lastIndexOf("/")) + "/"; String separator = System.getProperty("line.separator"); String head = "---" + separator + "layout: post" + separator + "title: \"" + file.getName() + "\"" + separator + "description: \"" + file.getName() + "\"" + separator + "category: pages\"" + separator + "tags: [blog]\"" + separator + "--- " + separator + "{% include JB/setup %}" + separator + separator; FilesUtil.isExist(hmPath); String parsedText = convertFile(file, charset); Calendar calendar = Calendar.getInstance(); String dateName = DateUtil.dateToShortString(calendar.getTime()); String newName = dateName + "-" + hmPath.replace(mdPath, "").replace("/", "-") + "-" + file.getName(); String mmName = (hmPath + newName.replace("html", "md")).replaceAll("\\s*", ""); FilesUtil.newFile(mmName, head + parsedText, charset); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public static void htmlToHexoMd(String htmlPath, String mdPath, String charset) { try { List<File> fileList = FilesUtil.getAllFiles(htmlPath, "html"); for (File file : fileList) { String mdName = file.getAbsolutePath().replace(htmlPath, mdPath).replace("html", "md"); String hmPath = mdName.substring(0, mdName.lastIndexOf("/")) + "/"; String separator = System.getProperty("line.separator"); String[] strings = hmPath.replace(mdPath, "").split("/"); Calendar calendar = Calendar.getInstance(); String dateName = DateUtil.dateToShortString(calendar.getTime()); String dateString = DateUtil.dateToLongString(calendar.getTime()); StringBuilder blog = new StringBuilder(); StringBuilder categories = new StringBuilder(); Map<String, String> stringMap = new TreeMap<String, String>(); for (String value : strings) { stringMap.put(value, value); } for (String tag : stringMap.keySet()) { blog.append(" - ").append(tag).append(separator); } categories.append(strings[0]); String head = "---" + separator + "layout: post" + separator + "title: \"" + file.getName().replace(".html", "").split("-")[0] + "\"" + separator + "date: " + dateString + separator + "categories: " + categories + separator + "tags: " + separator + blog.toString() + "--- " + separator + separator; FilesUtil.isExist(hmPath); String parsedText = HTML2Md.convertFile(file, "utf-8"); String newName = dateName + "-" + hmPath.replace(mdPath, "").replace("/", "-") + "-" + file.getName(); String mmName = (hmPath + newName.replace("html", "md")).replaceAll("\\s*", ""); FilesUtil.newFile(mmName, head + parsedText, charset); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } private static String parseDocument(Document dirtyDoc) { indentation = -1; String title = dirtyDoc.title(); Whitelist whitelist = Whitelist.relaxed(); Cleaner cleaner = new Cleaner(whitelist); Document doc = cleaner.clean(dirtyDoc); doc.outputSettings().escapeMode(EscapeMode.xhtml); if (!title.trim().equals("")) { return "# " + title + "\n\n" + getTextContent(doc); } else { return getTextContent(doc); } } private static String getTextContent(Element element) { ArrayList<MDLine> lines = new ArrayList<MDLine>(); List<Node> children = element.childNodes(); for (Node child : children) { if (child instanceof TextNode) { TextNode textNode = (TextNode) child; MDLine line = getLastLine(lines); if (line.getContent().equals("")) { if (!textNode.isBlank()) { line.append(textNode.text().replaceAll("#", "/#").replaceAll("\\*", "/\\*")); } } else { line.append(textNode.text().replaceAll("#", "/#").replaceAll("\\*", "/\\*")); } } else if (child instanceof Element) { Element childElement = (Element) child; processElement(childElement, lines); } else { System.out.println(); } } int blankLines = 0; StringBuilder result = new StringBuilder(); for (int i = 0; i < lines.size(); i++) { String line = lines.get(i).toString().trim(); if (line.equals("")) { blankLines++; } else { blankLines = 0; } if (blankLines < 2) { result.append(line); if (i < lines.size() - 1) { result.append("\n"); } } } return result.toString(); } private static void processElement(Element element, ArrayList<MDLine> lines) { Tag tag = element.tag(); String tagName = tag.getName(); if (tagName.equals("div")) { div(element, lines); } else if (tagName.equals("p")) { p(element, lines); } else if (tagName.equals("br")) { br(lines); } else if (tagName.matches("^h[0-9]+$")) { h(element, lines); } else if (tagName.equals("strong") || tagName.equals("b")) { strong(element, lines); } else if (tagName.equals("em")) { em(element, lines); } else if (tagName.equals("hr")) { hr(lines); } else if (tagName.equals("a")) { a(element, lines); } else if (tagName.equals("img")) { img(element, lines); } else if (tagName.equals("code")) { code(element, lines); } else if (tagName.equals("ul")) { ul(element, lines); } else if (tagName.equals("ol")) { ol(element, lines); } else if (tagName.equals("li")) { li(element, lines); } else { MDLine line = getLastLine(lines); line.append(getTextContent(element)); } } private static MDLine getLastLine(ArrayList<MDLine> lines) { MDLine line; if (lines.size() > 0) { line = lines.get(lines.size() - 1); } else { line = new MDLine(MDLineType.None, 0, ""); lines.add(line); } return line; } private static void div(Element element, ArrayList<MDLine> lines) { MDLine line = getLastLine(lines); String content = getTextContent(element); if (!content.equals("")) { if (!line.getContent().trim().equals("")) { lines.add(new MDLine(MDLineType.None, 0, "")); lines.add(new MDLine(MDLineType.None, 0, content)); lines.add(new MDLine(MDLineType.None, 0, "")); } else { if (!content.trim().equals("")) line.append(content); } } } private static void p(Element element, ArrayList<MDLine> lines) { MDLine line = getLastLine(lines); if (!line.getContent().trim().equals("")) lines.add(new MDLine(MDLineType.None, 0, "")); lines.add(new MDLine(MDLineType.None, 0, "")); lines.add(new MDLine(MDLineType.None, 0, getTextContent(element))); lines.add(new MDLine(MDLineType.None, 0, "")); if (!line.getContent().trim().equals("")) lines.add(new MDLine(MDLineType.None, 0, "")); } private static void br(ArrayList<MDLine> lines) { MDLine line = getLastLine(lines); if (!line.getContent().trim().equals("")) lines.add(new MDLine(MDLineType.None, 0, "")); } private static void h(Element element, ArrayList<MDLine> lines) { MDLine line = getLastLine(lines); if (!line.getContent().trim().equals("")) lines.add(new MDLine(MDLineType.None, 0, "")); int level = Integer.valueOf(element.tagName().substring(1)); switch (level) { case 1: lines.add(new MDLine(MDLineType.Head1, 0, getTextContent(element))); break; case 2: lines.add(new MDLine(MDLineType.Head2, 0, getTextContent(element))); break; default: lines.add(new MDLine(MDLineType.Head3, 0, getTextContent(element))); break; } lines.add(new MDLine(MDLineType.None, 0, "")); lines.add(new MDLine(MDLineType.None, 0, "")); } private static void strong(Element element, ArrayList<MDLine> lines) { MDLine line = getLastLine(lines); line.append("**"); line.append(getTextContent(element)); line.append("**"); } private static void em(Element element, ArrayList<MDLine> lines) { MDLine line = getLastLine(lines); line.append("*"); line.append(getTextContent(element)); line.append("*"); } private static void hr(ArrayList<MDLine> lines) { lines.add(new MDLine(MDLineType.None, 0, "")); lines.add(new MDLine(MDLineType.HR, 0, "")); lines.add(new MDLine(MDLineType.None, 0, "")); } private static void a(Element element, ArrayList<MDLine> lines) { MDLine line = getLastLine(lines); line.append("["); line.append(getTextContent(element)); line.append("]"); line.append("("); String url = element.attr("href"); line.append(url); String title = element.attr("title"); if (!title.equals("")) { line.append(" \""); line.append(title); line.append("\""); } line.append(")"); } private static void img(Element element, ArrayList<MDLine> lines) { MDLine line = getLastLine(lines); line.append("!["); String alt = element.attr("alt"); line.append(alt); line.append("]"); line.append("("); String url = element.attr("src"); line.append(url); String title = element.attr("title"); if (!title.equals("")) { line.append(" \""); line.append(title); line.append("\""); } line.append(")"); } private static void code(Element element, ArrayList<MDLine> lines) { lines.add(new MDLine(MDLineType.None, 0, "")); MDLine line = new MDLine(MDLineType.None, 0, " "); line.append(getTextContent(element).replace("\n", " ")); lines.add(line); lines.add(new MDLine(MDLineType.None, 0, "")); } private static void ul(Element element, ArrayList<MDLine> lines) { lines.add(new MDLine(MDLineType.None, 0, "")); indentation++; orderedList = false; MDLine line = new MDLine(MDLineType.None, 0, ""); line.append(getTextContent(element)); lines.add(line); indentation--; lines.add(new MDLine(MDLineType.None, 0, "")); } private static void ol(Element element, ArrayList<MDLine> lines) { lines.add(new MDLine(MDLineType.None, 0, "")); indentation++; orderedList = true; MDLine line = new MDLine(MDLineType.None, 0, ""); line.append(getTextContent(element)); lines.add(line); indentation--; lines.add(new MDLine(MDLineType.None, 0, "")); } private static void li(Element element, ArrayList<MDLine> lines) { MDLine line; if (orderedList) { line = new MDLine(MDLineType.Ordered, indentation, getTextContent(element)); } else { line = new MDLine(MDLineType.Unordered, indentation, getTextContent(element)); } lines.add(line); } }