package io.lumify.wikipedia;

import de.fau.cs.osr.ptk.common.AstVisitor;
import de.fau.cs.osr.utils.StringUtils;
import io.lumify.core.util.LumifyLogger;
import io.lumify.core.util.LumifyLoggerFactory;
import org.sweble.wikitext.engine.PageTitle;
import org.sweble.wikitext.engine.config.WikiConfig;
import org.sweble.wikitext.engine.nodes.EngPage;
import org.sweble.wikitext.parser.nodes.*;
import org.sweble.wikitext.parser.parser.LinkTargetException;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * AST visitor that renders a parsed wikitext page as plain text.
 * <p>
 * While rendering, the converter records the text offsets of every internal
 * link and redirect it writes; these are available from
 * {@link #getInternalLinks()} and {@link #getRedirects()} once the visitation
 * has finished. All per-run state is (re)initialised in
 * {@link #before(WtNode)}, so the recorded offsets always refer to the text
 * produced by the most recent run.
 */
public class TextConverter extends AstVisitor<WtNode> {
    private static final LumifyLogger LOGGER = LumifyLoggerFactory.getLogger(TextConverter.class);

    // Only referenced by the word-splitting logic that is disabled in write(String);
    // currently unused.
    private static final Pattern ws = Pattern.compile("\\s+");

    private final WikiConfig config;
    private final int wrapCol;

    /** Completed output. */
    private StringBuilder sb;

    /** Output of the current, not yet finished, line. */
    private StringBuilder line;

    /** Running number used to label external links. */
    private int extLinkNum;

    /**
     * Becomes true once we are no longer at the Beginning Of the whole Document.
     */
    private boolean pastBod;

    /** Number of newlines that writeWord() has to emit before the next word. */
    private int needNewlines;

    /** Whether writeWord() has to emit a space before the next word. */
    private boolean needSpace;

    private boolean noWrap;

    /** Section counters, one entry per nesting level. */
    private LinkedList<Integer> sections;

    private List<InternalLinkWithOffsets> internalLinks = new ArrayList<InternalLinkWithOffsets>();
    private List<RedirectWithOffsets> redirects = new ArrayList<RedirectWithOffsets>();

    // =========================================================================

    /**
     * @param config wiki configuration used to resolve page titles and namespaces
     */
    public TextConverter(WikiConfig config) {
        this.config = config;
        this.wrapCol = 100000;
    }

    /**
     * Resets all per-run state. This method is called by go() before
     * visitation starts.
     */
    @Override
    protected boolean before(WtNode node) {
        sb = new StringBuilder();
        line = new StringBuilder();
        extLinkNum = 1;
        pastBod = false;
        needNewlines = 0;
        needSpace = false;
        noWrap = true;
        sections = new LinkedList<Integer>();
        // The recorded offsets point into the text of a single run; start with
        // fresh lists so a reused converter does not report links of earlier
        // pages. New lists (rather than clear()) keep lists that were handed
        // out by the getters after a previous run intact.
        internalLinks = new ArrayList<InternalLinkWithOffsets>();
        redirects = new ArrayList<RedirectWithOffsets>();
        return super.before(node);
    }

    /**
     * Flushes the current line and returns the rendered text. This method is
     * called by go() after visitation has finished; the return value is
     * passed to go(), which passes it on to the caller.
     */
    @Override
    protected Object after(WtNode node, Object result) {
        finishLine();
        return sb.toString();
    }

    // =========================================================================

    /**
     * Fallback for all nodes that are not explicitly handled below: logs the
     * node and writes a placeholder of the form {@code <nodeName />}.
     */
    public void visit(WtNode n) {
        LOGGER.debug("fallback %s: %s", n.getClass().getName(), n.toString());
        write("<");
        write(n.getNodeName());
        write(" />");
    }

    public void visit(WtXmlEndTag xmlEndTag) {
        // do nothing
    }

    /** Renders the table body surrounded by line breaks. */
    public void visit(WtTable table) {
        write('\n');
        iterate(table.getBody());
        write('\n');
    }

    public void visit(WtTableImplicitTableBody body) {
        iterate(body.getBody());
    }

    public void visit(WtBody body) {
        iterate(body);
    }

    /** Renders the row's children followed by a line break. */
    public void visit(WtTableRow tableRow) {
        iterate(tableRow);
        write('\n');
    }

    public void visit(WtTableHeader tableHeader) {
        iterate(tableHeader);
    }

    public void visit(WtTableCell tableCell) {
        iterate(tableCell);
    }

    public void visit(WtTableCaption tableCell) {
        iterate(tableCell);
    }

    public void visit(WtXmlAttribute xmlAttribute) {
        // do nothing
    }

    /**
     * Writes {@code REDIRECT: <target>} and records the offsets of the target
     * text.
     */
    public void visit(WtRedirect redirect) {
        write("REDIRECT: ");
        int startOffset = getCurrentOffset();
        WtPageName target = redirect.getTarget();
        write(target.getAsString());
        int endOffset = getCurrentOffset();
        redirects.add(new RedirectWithOffsets(redirect, startOffset, endOffset));
    }

    public void visit(WtNodeList n) {
        iterate(n);
    }

    public void visit(WtUnorderedList e) {
        iterate(e);
    }

    public void visit(WtOrderedList e) {
        iterate(e);
    }

    public void visit(WtListItem item) {
        newline(1);
        iterate(item);
    }

    public void visit(EngPage p) {
        iterate(p);
    }

    public void visit(WtText text) {
        write(text.getContent());
    }

    public void visit(WtWhitespace w) {
        write(" ");
    }

    public void visit(WtBold b) {
        iterate(b);
    }

    public void visit(WtItalics i) {
        iterate(i);
    }

    public void visit(WtXmlCharRef cr) {
        write(Character.toChars(cr.getCodePoint()));
    }

    /**
     * Writes the resolved entity, or the raw {@code &name;} form when the
     * entity could not be resolved.
     */
    public void visit(WtXmlEntityRef er) {
        String ch = er.getResolved();
        if (ch == null) {
            write('&');
            write(er.getName());
            write(';');
        } else {
            write(ch);
        }
    }

    public void visit(WtUrl wtUrl) {
        writeUrl(wtUrl.getProtocol(), wtUrl.getPath());
    }

    /**
     * Writes an external link as {@code [n] title (url)}, where {@code n} is
     * a running number.
     */
    public void visit(WtExternalLink link) {
        write('[');
        write(extLinkNum++);
        write("] ");
        iterate(link.getTitle());
        write(" (");
        // Same rendering as visit(WtUrl): no stray ':' for a target without protocol.
        writeUrl(link.getTarget().getProtocol(), link.getTarget().getPath());
        write(')');
    }

    /**
     * Writes the visible text of an internal link (prefix, title or target,
     * postfix) and records its offsets. Links into the "Category" namespace
     * are skipped entirely.
     */
    public void visit(WtInternalLink link) {
        int startOffset = getCurrentOffset();
        try {
            if (link.getTarget().isResolved()) {
                PageTitle page = PageTitle.make(config, link.getTarget().getAsString());
                if (page.getNamespace().equals(config.getNamespace("Category"))) {
                    return;
                }
            }
        } catch (LinkTargetException e) {
            // The namespace of the target cannot be determined, so the link
            // cannot be identified as a category link; render it like any
            // other internal link.
            LOGGER.debug("could not parse internal link target: %s", e.getMessage());
        }

        write(link.getPrefix());
        if (!link.hasTitle()) {
            iterate(link.getTarget());
        } else {
            iterate(link.getTitle());
        }
        write(link.getPostfix());

        int endOffset = getCurrentOffset();
        internalLinks.add(new InternalLinkWithOffsets(link, startOffset, endOffset));
    }

    /**
     * Renders a section: the (numbered) heading on its own, followed by the
     * section body.
     */
    public void visit(WtSection s) {
        finishLine();

        // Render the heading into a temporary buffer so it can be trimmed and
        // prefixed with the section number before it is written.
        // NOTE(review): getCurrentOffset() reads the temporary buffer while the
        // heading is rendered, so internal links inside a heading are recorded
        // with offsets relative to that buffer, not to the document.
        StringBuilder saveSb = sb;
        boolean saveNoWrap = noWrap;

        sb = new StringBuilder();
        noWrap = true;

        iterate(s.getHeading());
        finishLine();
        String title = sb.toString().trim();

        sb = saveSb;

        if (s.getLevel() >= 1) {
            while (sections.size() > s.getLevel()) {
                sections.removeLast();
            }
            while (sections.size() < s.getLevel()) {
                sections.add(1);
            }

            // Build the "1.2. " style prefix; the counter of the first level
            // is not part of it.
            StringBuilder sb2 = new StringBuilder();
            for (int i = 1; i < sections.size(); ++i) {
                sb2.append(sections.get(i));
                sb2.append('.');
            }

            if (sb2.length() > 0) {
                sb2.append(' ');
            }
            sb2.append(title);
            title = sb2.toString();
        }

        newline(2);
        write(title);
        newline(2);

        noWrap = saveNoWrap;

        iterate(s.getBody());

        while (sections.size() > s.getLevel()) {
            sections.removeLast();
        }
        // Advance the counter of this level. The list is empty when the level
        // is below 1; removeLast() would throw NoSuchElementException then.
        if (!sections.isEmpty()) {
            sections.add(sections.removeLast() + 1);
        }
    }

    public void visit(WtParagraph p) {
        iterate(p);
        newline(2);
    }

    public void visit(WtHorizontalRule hr) {
        newline(2);
    }

    /** Renders {@code <br>} as a line break and any other element as its body. */
    public void visit(WtXmlElement e) {
        if (e.getName().equalsIgnoreCase("br")) {
            newline(1);
        } else {
            iterate(e.getBody());
        }
    }

    // =========================================================================
    // Stuff we want to hide

    public void visit(WtImageLink n) {
    }

    public void visit(WtIllegalCodePoint n) {
    }

    public void visit(WtXmlComment n) {
    }

    public void visit(WtTemplate n) {
    }

    public void visit(WtTemplateArgument n) {
    }

    public void visit(WtTemplateParameter n) {
    }

    public void visit(WtTagExtension n) {
    }

    public void visit(WtPageSwitch n) {
    }

    // =========================================================================

    /**
     * Requests at least {@code num} newlines before the next word written by
     * writeWord(). Ignored while still at the beginning of the document.
     */
    private void newline(int num) {
        if (pastBod) {
            if (num > needNewlines) {
                needNewlines = num;
            }
        }
    }

    /**
     * Requests a space before the next word written by writeWord(). Ignored
     * while still at the beginning of the document.
     */
    private void wantSpace() {
        if (pastBod) {
            needSpace = true;
        }
    }

    /** Moves the content of the current line to the output buffer. */
    private void finishLine() {
        sb.append(line.toString());
        line.setLength(0);
    }

    /**
     * Finishes the current line, appends {@code num} newlines and clears the
     * pending newline/space requests.
     */
    private void writeNewlines(int num) {
        finishLine();
        sb.append(StringUtils.strrep('\n', num));
        needNewlines = 0;
        needSpace = false;
    }

    /**
     * Appends a word to the current line, emitting pending newlines or a
     * pending space first and wrapping the line if wrapping is enabled.
     * Writing a non-empty word marks the beginning of the document as passed.
     */
    private void writeWord(String s) {
        int length = s.length();
        if (length == 0) {
            return;
        }

        if (!noWrap && needNewlines <= 0) {
            if (needSpace) {
                length += 1;
            }

            if (line.length() + length >= wrapCol && line.length() > 0) {
                writeNewlines(1);
            }
        }

        if (needSpace && needNewlines <= 0) {
            line.append(' ');
        }

        if (needNewlines > 0) {
            writeNewlines(needNewlines);
        }

        needSpace = false;
        pastBod = true;
        line.append(s);
    }

    /**
     * Appends {@code s} verbatim to the current line; {@code null} and empty
     * strings are ignored.
     * <p>
     * Unlike writeWord(), this neither emits pending newlines or spaces nor
     * marks the beginning of the document as passed; it only requests a space
     * when {@code s} starts or ends with a space character.
     */
    private void write(String s) {
        if (s == null || s.isEmpty()) {
            return;
        }

        if (Character.isSpaceChar(s.charAt(0))) {
            wantSpace();
        }

        // The text is appended as-is instead of being split on whitespace and
        // passed word by word to writeWord().
        // NOTE(review): presumably done to keep the whitespace of the source
        // text intact for the recorded offsets - confirm.
        line.append(s);

        if (Character.isSpaceChar(s.charAt(s.length() - 1))) {
            wantSpace();
        }
    }

    private void write(char[] cs) {
        write(String.valueOf(cs));
    }

    private void write(char ch) {
        writeWord(String.valueOf(ch));
    }

    private void write(int num) {
        writeWord(String.valueOf(num));
    }

    /**
     * Writes {@code protocol:path}, or just {@code path} when there is no
     * protocol.
     */
    private void writeUrl(String protocol, String path) {
        if (protocol != null && !protocol.isEmpty()) {
            write(protocol);
            write(':');
        }
        write(path);
    }

    /**
     * @return the internal links written during the most recent run, together
     *         with their text offsets
     */
    public List<InternalLinkWithOffsets> getInternalLinks() {
        return internalLinks;
    }

    /**
     * @return the redirects written during the most recent run, together with
     *         their text offsets
     */
    public List<RedirectWithOffsets> getRedirects() {
        return redirects;
    }

    /**
     * @return the number of characters written so far, including the current,
     *         not yet finished, line
     */
    public int getCurrentOffset() {
        return sb.length() + line.length();
    }
}