/* __ * \ \ * _ _ \ \ ______ * | | | | > \( __ ) * | |_| |/ ^ \| || | * | ._,_/_/ \_\_||_| * | | * |_| * * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * <rob ∂ CLABS dot CC> wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. * ---------------------------------------------------------------------------- */ package com.formulasearchengine.mathosphere.mlp.text; import org.apache.commons.lang3.text.translate.AggregateTranslator; import org.apache.commons.lang3.text.translate.CharSequenceTranslator; import org.apache.commons.lang3.text.translate.EntityArrays; import org.apache.commons.lang3.text.translate.LookupTranslator; import org.eclipse.mylyn.wikitext.core.parser.Attributes; import org.eclipse.mylyn.wikitext.core.parser.builder.NoOpDocumentBuilder; import java.util.Deque; import java.util.LinkedList; import java.util.regex.Pattern; /** * A DocumentBuilder for the mylyn wikitext parser. It converts a document written in * MediaWiki-Markup into plaintext. Most of the structure of the document will be stripped, * including linebreaks, headings, etc. * * @author rob */ public class PlaintextDocumentBuilder extends NoOpDocumentBuilder { private static final char LEFT_DOUBLE_QUOTE = '\u201c'; private static final char RIGHT_DOUBLE_QUOTE = '\u201d'; private StringBuilder writer = new StringBuilder(); /** * These lists store all blocks within a block/span that will not be rendered. */ private Deque<BlockType> skipBlocks = new LinkedList<>(); private Deque<SpanType> skipSpans = new LinkedList<>(); /** * store all spans that will be rendered */ private LinkedList<SpanType> passingSpans = new LinkedList<>(); private String result = ""; @Override public void endDocument() { String doc = WikiTextUtils.subsup(writer.toString()); // remove remaining/undetected templates doc = Pattern.compile("\\{\\{[^\\{]*?\\}\\}").matcher(doc).replaceAll(""); doc = Pattern.compile("\\u2016[^\\u2016]*?\\u2016").matcher(doc).replaceAll(""); // remove dangling lines doc = Pattern.compile("(:?\\A|\\n)\\s*[\\*\\|:].*").matcher(doc).replaceAll(""); doc = Pattern.compile("\\}\\}\\s*").matcher(doc).replaceAll(""); // remove undetected emphasis tags doc = Pattern.compile("'{2,}").matcher(doc).replaceAll(""); // comments doc = Pattern.compile("<!--.*?-->", Pattern.DOTALL).matcher(doc).replaceAll(""); // headings doc = Pattern.compile("([=]{2,4})[^\\n]*?\\1", Pattern.DOTALL).matcher(doc).replaceAll(""); // references doc = Pattern.compile("<references>.*?</references>", Pattern.DOTALL).matcher(doc).replaceAll(""); doc = Pattern.compile("<ref[^>/]*>.*?</ref>", Pattern.DOTALL).matcher(doc).replaceAll(""); doc = Pattern.compile("<ref[^>]*>").matcher(doc).replaceAll(""); doc = Pattern.compile("</ref[^>]*>").matcher(doc).replaceAll(""); // empty/unknown inline tags and non inline tags doc = Pattern.compile("<([^ >]+)[^>]*>(.*?)</\\1>").matcher(doc).replaceAll("$2"); doc = Pattern.compile("<([^ >]+)[^>]*/?>").matcher(doc).replaceAll(" "); // fix for undetected links doc = Pattern.compile("\\[\\[([^\\|]*)|([^\\]]*)]]").matcher(doc).replaceAll("$2"); doc = Pattern.compile("\\[\\[[^\\[\\]]*]]").matcher(doc).replaceAll(""); // strip unneeded linebreaks, etc. doc = Pattern.compile("\\n+").matcher(doc).replaceAll(" "); doc = Pattern.compile("\\s+").matcher(doc).replaceAll(" "); // remove language links doc = Pattern.compile("“[a-z]{2,3}:.*?”").matcher(doc).replaceAll(""); // remove misc quotation symbols doc = Pattern.compile("'|\\\"").matcher(doc).replaceAll(""); // reposition plurals into links doc = Pattern.compile("”(\\w)").matcher(doc).replaceAll("$1”"); // good hackers trim! doc = doc.trim(); this.result = doc; } public String getResult() { return result; } @Override public void beginBlock(BlockType type, Attributes attributes) { switch (type) { // passing blocks case PARAGRAPH: case DEFINITION_ITEM: case DEFINITION_TERM: case NUMERIC_LIST: case DEFINITION_LIST: case BULLETED_LIST: if (skipBlocks.size() > 0) { skipBlocks.add(type); } break; // blocks that will be skipped case TIP: case WARNING: case INFORMATION: case NOTE: case PANEL: case FOOTNOTE: case QUOTE: case CODE: case LIST_ITEM: case TABLE: case TABLE_ROW: case TABLE_CELL_HEADER: case TABLE_CELL_NORMAL: case PREFORMATTED: skipBlocks.add(type); break; default: break; } } @Override public void endBlock() { if (!skipBlocks.isEmpty()) { skipBlocks.removeLast(); } else { writer.append(" "); } } @Override public void beginSpan(SpanType type, Attributes attributes) { switch (type) { // passing spans case EMPHASIS: case ITALIC: case SPAN: case STRONG: case BOLD: case SUBSCRIPT: case SUPERSCRIPT: case UNDERLINED: case CITATION: if (skipSpans.size() > 0) { skipSpans.add(type); } else { passingSpans.add(type); } break; // span that will be skipped case INSERTED: case DELETED: case MONOSPACE: case CODE: skipSpans.add(type); break; default: break; } } @Override public void endSpan() { if (!skipSpans.isEmpty()) { skipSpans.removeLast(); } else { passingSpans.removeLast(); } } @Override public void beginHeading(int level, Attributes attributes) { skipSpans.add(SpanType.SPAN); } @Override public void endHeading() { if (!skipSpans.isEmpty()) { skipSpans.removeLast(); } } @Override public void characters(String text) { if (skipBlocks.size() > 0) { return; } if (skipSpans.size() > 0) { return; } if (!passingSpans.isEmpty()) { SpanType type = passingSpans.getLast(); switch (type) { case SUBSCRIPT: text = "_" + text; break; case SUPERSCRIPT: text = "^" + text; break; default: break; } } writer.append(text); } private static final CharSequenceTranslator TRANSLATOR = new AggregateTranslator( new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()), new LookupTranslator(EntityArrays.BASIC_UNESCAPE()), new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE())); @Override public void entityReference(String entity) { String translatedEntity = TRANSLATOR.translate('&' + entity + ';'); writer.append(translatedEntity); } @Override public void link(Attributes attributes, String link, String text) { if (link.isEmpty() && text.isEmpty()) { return; } // skip if (skipBlocks.size() > 0) { return; } if (skipSpans.size() > 0) { return; } String full = (link + text).toLowerCase(); // special link types if (full.contains("category:")) { return; } if (full.contains("image:")) { return; } if (full.contains("file:")) { return; } if (full.contains("thumb")) { return; } if (full.contains("|")) { return; } // urls, because the parser also detects raw links if (full.matches("https?:")) { return; } // language links if (text.matches("\\w{2}:")) { return; } // when textfield is empty the link will be shown, except anything in // parentheses. if (text.isEmpty()) { text = link.replaceAll("\\(.*?\\)", ""); } writer.append(LEFT_DOUBLE_QUOTE + text + RIGHT_DOUBLE_QUOTE); } @Override public void acronym(String text, String definition) { writer.append(text); } @Override public void lineBreak() { writer.append("\n"); } @Override public void charactersUnescaped(String literal) { writer.append(literal); } }