package com.formulasearchengine.mathosphere.mlp.text;
/**
* Copyright 2011 The Open Source Research Group, University of Erlangen-Nürnberg <p> Licensed under
* the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at <p> http://www.apache.org/licenses/LICENSE-2.0
* <p> Unless required by applicable law or agreed to in writing, software distributed under the
* License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.google.common.collect.Multiset;
import com.formulasearchengine.mathosphere.mlp.cli.BaseConfig;
import com.formulasearchengine.mathosphere.mlp.contracts.TextExtractorMapper;
import com.formulasearchengine.mathosphere.mlp.pojos.MathTag;
import com.formulasearchengine.mathosphere.mlp.pojos.WikidataLink;
import com.jcabi.log.Logger;
import de.fau.cs.osr.ptk.common.AstVisitor;
import de.fau.cs.osr.utils.StringUtils;
import org.sweble.wikitext.engine.EngineException;
import org.sweble.wikitext.engine.PageId;
import org.sweble.wikitext.engine.PageTitle;
import org.sweble.wikitext.engine.WtEngineImpl;
import org.sweble.wikitext.engine.config.WikiConfig;
import org.sweble.wikitext.engine.nodes.EngPage;
import org.sweble.wikitext.engine.nodes.EngProcessedPage;
import org.sweble.wikitext.engine.utils.DefaultConfigEnWp;
import org.sweble.wikitext.parser.nodes.*;
import org.sweble.wikitext.parser.nodes.WtContentNode.WtContentNodeImpl;
import org.sweble.wikitext.parser.parser.LinkTargetException;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
/**
* A visitor to convert an article AST into a pure text representation. To better understand the
* visitor pattern as implemented by the Visitor class, please take a look at the following
* resources: <ul> <li><a href="http://en.wikipedia.org/wiki/Visitor_pattern">http://en.wikipedia
* .org/wiki/Visitor_pattern</a> (classic pattern)</li> <li><a href="http://www.javaworld.com/javaworld/javatips/jw-javatip98.html">http
* ://www.javaworld.com/javaworld/javatips/jw-javatip98.html</a> (the version we use here)</li>
* </ul>
* <p>
* The methods needed to descend into an AST and visit the children of a given node <code>n</code>
* are <ul> <li><code>dispatch(n)</code> - visit node <code>n</code>,</li>
* <li><code>iterate(n)</code> - visit the <b>children</b> of node <code>n</code>,</li>
* <li><code>map(n)</code> - visit the <b>children</b> of node <code>n</code> and gather the return
* values of the <code>visit()</code> calls in a list,</li> <li><code>mapInPlace(n)</code> - visit
* the <b>children</b> of node <code>n</code> and replace each child node <code>c</code> with the
* return value of the call to <code>visit(c)</code>.</li> </ul>
*/
public class MathConverter
extends
AstVisitor<WtNode> {
private final static Pattern subMatch = Pattern.compile("[{<]sub[}>](.+?)[{<]/sub[}>]");
private final static WikiConfig config = DefaultConfigEnWp.generate();
private final static WtEngineImpl engine = new WtEngineImpl(config);
private static final Pattern ws = Pattern.compile("\\s+");
private final EngProcessedPage page;
private List<MathTag> mathTags = new ArrayList<>();
private List<WikidataLink> links = new ArrayList<>();
private StringBuilder sb;
private StringBuilder line;
private int extLinkNum;
private WikidataLinkMap wl = null;
private static int i = 0;
/**
* Becomes true if we are no long at the Beginning Of the whole Document.
*/
private boolean pastBod;
private int needNewlines;
private boolean needSpace;
private boolean noWrap;
private LinkedList<Integer> sections;
private PageTitle pageTitle;
private String texInfoUrl;
public MathConverter(String wikiText, String name) throws LinkTargetException, EngineException {
pageTitle = PageTitle.make(config, name);
PageId pageId = new PageId(pageTitle, -1);
page = engine.postprocess(pageId, wikiText, null);
texInfoUrl = (new BaseConfig()).getTexvcinfoUrl();
}
public MathConverter(String wikiText) throws LinkTargetException, EngineException {
this(wikiText, "noname");
}
public MathConverter(String wikitext, String title, BaseConfig config) throws LinkTargetException, EngineException {
this(wikitext, title);
if (config.getWikiDataFile() != null) {
wl = new WikidataLinkMap(config.getWikiDataFile());
} else {
wl = null;
}
texInfoUrl = config.getTexvcinfoUrl();
}
public List<MathTag> getMathTags() {
return mathTags;
}
// =========================================================================
@Override
protected boolean before(WtNode node) {
// This method is called by go() before visitation starts
sb = new StringBuilder();
line = new StringBuilder();
extLinkNum = 1;
pastBod = false;
needNewlines = 0;
needSpace = false;
noWrap = false;
sections = new LinkedList<>();
return super.before(node);
}
@Override
protected Object after(WtNode node, Object result) {
finishLine();
// This method is called by go() after visitation has finished
// The return value will be passed to go() which passes it to the caller
return sb.toString();
}
// =========================================================================
public void visit(WtNode n) {
// Fallback for all nodes that are not explicitly handled below
// System.out.println(n.getNodeName());
// write("<");
// write(n.getNodeName());
// write(" />");
}
public void visit(WtNodeList n) {
iterate(n);
}
public void visit(WtUnorderedList e) {
iterate(e);
}
public void visit(WtOrderedList e) {
iterate(e);
}
public void visit(WtListItem item) {
writeNewlines(1);
iterate(item);
}
public void visit(EngPage p) {
iterate(p);
}
public void visit(WtText text) {
write(text.getContent());
}
public void visit(WtWhitespace w) {
write(" ");
}
public void visit(WtBold b) {
if (detectHiddenMath(b)) {
return;
}
write("\"");
iterate(b);
write("\"");
}
public void visit(WtItalics i) {
if (detectHiddenMath(i)) return;
write("\"");
iterate(i);
write("\"");
}
public boolean detectHiddenMath(WtNode i) {
if (i.size() == 1 && i.get(0) instanceof WtText) {
final String tex = getTex(i, false);
if (tex != null) {
int location;
try {
location = i.getLocation().line;
} catch (NullPointerException n) {
location = 0;
}
MathTag tag = new MathTag(location, tex, WikiTextUtils.MathMarkUpType.MATH_TEMPLATE);
mathTags.add(tag);
needSpace = true;
writeWord(tag.placeholder());
needSpace = true;
return true;
}
} else {
if (i.size() == 2 && i.get(0) instanceof WtText && i.get(1) instanceof WtXmlElement) {
//discover hidden subscripts
final WtXmlElement xml = (WtXmlElement) i.get(1);
if (xml.getName().matches("sub") &&
xml.getBody().size() == 1 &&
xml.getBody().get(0) instanceof WtText) {
//String subtext = ((WtText) ((WtXmlElement) i.get(1)).getBody().get(0)).getContent();
final String subTex = getTex((WtContentNodeImpl) xml.getBody(), true);
final String mainTex = getTex(i, true);
if (mainTex != null) {
String tex = mainTex + "_{" + subTex + "}";
int location;
try {
location = i.getLocation().line;
} catch (NullPointerException n) {
location = 0;
}
MathTag tag = new MathTag(location, tex, WikiTextUtils.MathMarkUpType.MATH_TEMPLATE);
mathTags.add(tag);
needSpace = true;
writeWord(tag.placeholder());
needSpace = true;
return true;
}
}
}
}
return false;
}
public void visit(WtXmlCharRef cr) {
write(Character.toChars(cr.getCodePoint()));
}
public void visit(WtXmlEntityRef er) {
String ch = er.getResolved();
if (ch == null) {
write('&');
write(er.getName());
write(';');
} else {
write(ch);
}
}
public void visit(WtUrl wtUrl) {
if (!wtUrl.getProtocol().isEmpty()) {
write(wtUrl.getProtocol());
write(':');
}
write(wtUrl.getPath());
}
public void visit(WtExternalLink link) {
write('[');
write(extLinkNum++);
write(']');
}
public void visit(WtInternalLink link) {
String linkName = link.getTarget().getAsString().split("#")[0];
if (wl != null) {
String newName = wl.title2Data(linkName);
if (newName != null) {
write("LINK_" + newName);
return;
}
}
WikidataLink wl = new WikidataLink(linkName);
write("LINK_" + wl.getContentHash());
needSpace = true;
if (link.getTitle().size() > 0) {
StringBuilder tmp = this.line;
this.line = new StringBuilder();
iterate(link.getTitle());
wl.setTitle(this.line.toString());
this.line = tmp;
}
links.add(wl);
}
public void visit(WtSection s) {
finishLine();
StringBuilder saveSb = sb;
boolean saveNoWrap = noWrap;
sb = new StringBuilder();
noWrap = true;
iterate(s.getHeading());
finishLine();
String title = sb.toString().trim();
sb = saveSb;
if (s.getLevel() >= 1) {
while (sections.size() > s.getLevel())
sections.removeLast();
while (sections.size() < s.getLevel())
sections.add(1);
StringBuilder sb2 = new StringBuilder();
for (int i = 0; i < sections.size(); ++i) {
if (i < 1)
continue;
sb2.append(sections.get(i));
sb2.append('.');
}
if (sb2.length() > 0)
sb2.append(' ');
sb2.append(title);
title = sb2.toString();
}
newline(2);
write(title);
newline(1);
write(StringUtils.strrep('-', title.length()));
newline(2);
noWrap = saveNoWrap;
try {
// Don't care about errors
iterate(s.getBody());
} catch (Exception e) {
Logger.info(e, "Problem prcessing page", pageTitle.getTitle());
e.printStackTrace();
}
while (sections.size() > s.getLevel())
sections.removeLast();
sections.add(sections.removeLast() + 1);
}
public void visit(WtParagraph p) {
iterate(p);
newline(2);
}
public void visit(WtHorizontalRule hr) {
newline(1);
write(StringUtils.strrep('-', 10));
newline(2);
}
public void visit(WtXmlElement e) {
if (e.getName().equalsIgnoreCase("br")) {
newline(1);
} else if (e.getName().equalsIgnoreCase("var")) {
WtNode wtNodes = e.getBody().get(0);
String content;
if (wtNodes instanceof WtText) {
content = ((WtText) wtNodes).getContent().trim();
handeLatexMathTag(e, content);
} else if (wtNodes instanceof WtInternalLink) {
//TODO: do not throw away the information of the link from WtInternalLink.getTarget()
//Identifier is more important than link. Link maybe helpful for wikidata.
content = ((WtText) ((WtInternalLink) e.getBody().get(0)).getTitle().get(0)).getContent().trim();
handeLatexMathTag(e, content);
}
} else {
iterate(e.getBody());
}
}
// =========================================================================
// Stuff we want to hide
public void visit(WtImageLink n) {
iterate(n.getTitle());
}
public void visit(WtIllegalCodePoint n) {
}
public void visit(WtXmlComment n) {
}
public void visit(WtTable b) {
iterate(b.getBody());
}
public void visit(WtTableRow b) {
iterate(b);
}
public void visit(WtTableCell b) {
iterate(b);
}
public void visit(WtTableImplicitTableBody b) {
iterate(b);
}
public void visit(WtTableHeader b) {
iterate(b);
}
public void visit(WtTableCaption b) {
iterate(b);
}
public void visit(WtNewline n) {
writeNewlines(1);
}
public void visit(WtTemplate n) {
try {
WtTemplateArgument arg0;
String content;
switch (n.getName().getAsString().toLowerCase()) {
case "math":
arg0 = (WtTemplateArgument) n.getArgs().get(0);
content = ((WtText) arg0.getValue().get(0)).getContent().trim();
handeLatexMathTag(n, content);
break;
case "mvar":
arg0 = (WtTemplateArgument) n.getArgs().get(0);
content = ((WtText) arg0.getValue().get(0)).getContent().trim();
content = wiki2Tex(content);
MathTag tag = new MathTag(n.getLocation().line, content, WikiTextUtils.MathMarkUpType.MVAR_TEMPLATE);
mathTags.add(tag);
needSpace = true;
writeWord(tag.placeholder());
needSpace = true;
break;
default:
iterate(n.getArgs());
}
} catch (Exception e) {
Logger.info(e, "Problem prcessing page", pageTitle.getTitle());
}
}
public void handeLatexMathTag(WtNode n, String content) {
content = TextExtractorMapper.unescape(content);
//content = content.replaceAll("'''([a-zA-Z]+)'''","\\mathbf{$1}");
content = wiki2Tex(content);
int location = 0;
try {
location = n.getLocation().line;
} catch (Exception ignored) {
//we don't really need this
}
MathTag tag = new MathTag(location, content, WikiTextUtils.MathMarkUpType.MATH_TEMPLATE);
mathTags.add(tag);
needSpace = true;
writeWord(tag.placeholder());
needSpace = true;
}
public String wiki2Tex(String content) {
content = subMatch.matcher(content).replaceAll("_{$1}")
.replaceAll("[{<]sup[}>](.+?)[{<]/sup[}>]", "^{$1}")
.replaceAll("'''(.+?)'''", "\\\\mathbf{$1}")
.replaceAll("''(.+?)''", "\\\\mathit{$1}");
return UnicodeMap.string2TeX(content);
// int[] chars = content.codePoints().toArray();
// StringBuilder res = new StringBuilder();
//
// for (int code : chars) {
// if (code > 128) {
// res.append(UnicodeMap.char2TeX(code));
// } else {
// res.append((char) code);
// }
// }
// return res.toString().trim();
}
public void visit(WtTemplateArgument n) {
if (!detectHiddenMath(n.getValue())) {
iterate(n.getValue());
}
}
public void visit(WtTemplateParameter n) {
}
public void visit(WtTagExtension n) {
if (n.getName().equals("math")) {
MathTag tag = new MathTag(n.getLocation().line, n.getBody().getContent(), WikiTextUtils.MathMarkUpType.LATEX);
// System.err.println(i+++" : "+ n.getBody().getContent());
mathTags.add(tag);
if (needNewlines > 0) {
write(" ");
}
needSpace = true;
writeWord(tag.placeholder());
needSpace = true;
} else if (n.getName().equals("ref")) {
String content = n.getBody().getContent();
if (!content.contains("<math")) {
return;
}
final List<MathTag> tags = WikiTextUtils.findMathTags(content);
content = WikiTextUtils.replaceAllFormulas(content, tags);
mathTags.addAll(tags);
write("(");
write(content);
write("}");
}
}
public void visit(WtPageSwitch n) {
}
// =========================================================================
private void newline(int num) {
if (pastBod) {
if (num > needNewlines)
needNewlines = num;
}
}
private void wantSpace() {
if (pastBod)
needSpace = true;
}
private void finishLine() {
sb.append(line.toString());
sb.append(" ");
line.setLength(0);
}
private void writeNewlines(int num) {
finishLine();
sb.append(StringUtils.strrep('\n', num));
needNewlines = 0;
needSpace = false;
}
private void writeWord(String s) {
int length = s.length();
if (length == 0)
return;
if (needSpace && needNewlines <= 0)
line.append(' ');
if (needNewlines > 0)
writeNewlines(needNewlines);
needSpace = false;
pastBod = true;
line.append(s);
}
private void write(String s) {
if (s.isEmpty())
return;
if (Character.isSpaceChar(s.charAt(0)))
wantSpace();
String[] words = ws.split(s);
for (int i = 0; i < words.length; ) {
writeWord(words[i]);
if (++i < words.length)
wantSpace();
}
final char lastChar = s.charAt(s.length() - 1);
if (Character.isSpaceChar(lastChar) || lastChar == '\n')
wantSpace();
}
private void write(char[] cs) {
write(String.valueOf(cs));
}
private void write(char ch) {
writeWord(String.valueOf(ch));
}
private void write(int num) {
writeWord(String.valueOf(num));
}
public String getStrippedOutput() {
return (String) this.go(page.getPage());
}
public String getOutput() {
String output = getStrippedOutput();
for (WikidataLink link : links) {
if (link.getTitle() == null) {
output = output.replace("LINK_" + link.getContentHash(), "[[" + link.getContent() + "]]");
} else {
output = output.replace("LINK_" + link.getContentHash(), "[[" + link.getContent() + "|" + link.getTitle() + "]]");
}
}
for (MathTag tag : mathTags) {
output = output.replace("FORMULA_" + tag.getContentHash(),
"<math>" + tag.getContent() + "</math>");
}
return output;
}
public List<WikidataLink> getLinks() {
return links;
}
private String getTex(WtNode i, boolean force) {
if (i.get(0) instanceof WtText) {
String content = ((WtText) i.get(0)).getContent().trim();
content = TextExtractorMapper.unescape(content);
String tex = wiki2Tex(content);
if (tex.length() > 0 && (content.length() == 1
|| (content.length() < 100 && !content.equals(tex)))) {
Multiset<String> idents;
try {
idents = TexInfo.getIdentifiers(tex, texInfoUrl);
} catch (XPathExpressionException | ParserConfigurationException | IOException
| SAXException | TransformerException ignored) {
return null;
}
if (idents.size() == 0 && !force) {
return null;
}
if (i instanceof WtBold) {
tex = "\\mathbf{" + tex + "}";
}
return tex;
}
if (force) {
return tex;
}
}
return null;
}
}