package com.formulasearchengine.mathosphere.mlp.text;
import com.formulasearchengine.mathosphere.mlp.pojos.MathTag;
import com.formulasearchengine.mathosphere.mlp.pojos.ParsedWikiDocument;
import com.formulasearchengine.mathosphere.mlp.pojos.WikidataLink;
import com.formulasearchengine.mathosphere.mlp.pojos.Word;
import org.eclipse.mylyn.wikitext.core.parser.MarkupParser;
import org.eclipse.mylyn.wikitext.mediawiki.core.MediaWikiLanguage;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class WikiTextUtils {
private static final Pattern MATH_TAG_PATTERN = Pattern.compile("<math.+?</math>", Pattern.DOTALL);
private static int i = 0;
public static List<MathTag> findMathTags(String text) {
List<MathTag> results = new ArrayList<>();
Matcher matcher = MATH_TAG_PATTERN.matcher(text);
while (matcher.find()) {
String tag = matcher.group();
MathMarkUpType markUp = guessMarkupType(tag);
if (markUp == MathMarkUpType.LATEX) {
tag = tag.replaceAll("<math>", "").replaceAll("</math>", "");
}
results.add(new MathTag(matcher.start(), tag, markUp));
// System.err.println(i+++":"+tag);
}
return results;
}
private static MathMarkUpType guessMarkupType(String math) {
int closingBracket = math.indexOf(">", "<math".length());
String afterTag = math.substring(closingBracket + 1, math.length()).trim();
if (afterTag.startsWith("<")) {
return MathMarkUpType.MATHML;
} else {
return MathMarkUpType.LATEX;
}
}
public static String replaceAllFormulas(String text, List<MathTag> mathTags) {
StringBuilder newText = new StringBuilder(text.length());
int offset = 0;
for (MathTag tag : mathTags) {
newText.append(text.substring(offset, tag.getPosition()));
newText.append(tag.placeholder());
offset = tag.getPosition() + tag.getContent().length();
if (tag.getMarkUpType() == MathMarkUpType.LATEX && !tag.getContent().startsWith("<math")) {
offset += 13; //<math></math>
}
}
newText.append(text.substring(offset, text.length()));
return newText.toString();
}
public static String renderAllFormulae(String text) {
return StringReplacer.replace(text, MATH_TAG_PATTERN, (Matcher m) ->
{
try {
return TeX2MathML.TeX2MML(m.group(0).replaceAll("<math.*?>", "").replaceAll("</math>", ""));
} catch (Exception e) {
e.printStackTrace();
return m.group(0);
}
}
);
}
public static String subsup(String markup) {
return markup.replaceAll("[{<]sub[}>](.+?)[{<]/sub[}>]", "_$1")
.replaceAll("[{<]sup[}>](.+?)[{<]/sup[}>]", "^$1");
}
public static String extractPlainText(String wikiMarkup) {
MarkupParser parser = new MarkupParser();
parser.setMarkupLanguage(new MediaWikiLanguage());
PlaintextDocumentBuilder builder = new PlaintextDocumentBuilder();
parser.setBuilder(builder);
parser.parse(wikiMarkup);
return builder.getResult();
}
/**
* Get a definiens from a link. I.e. convert LINK_******** to [[LinkContent]] or [[LinkDefiniens]].
* This method removes the explicit information where this link pints to and replaces it with a human readable representation.
*
* @param word Link the definiens is wanted for.
* @param doc Document containing the link.
* @return The underlying, human readable definiens.
*/
public static String deLinkify(Word word, ParsedWikiDocument doc) {
String definition;
if (word.getPosTag().equals(PosTag.LINK)) {
String hash = word.getWord().replaceAll("^LINK_", "");
WikidataLink link = doc.getLinkMap().get(hash);
if (link != null) {
definition = "[[" + link.getContent() + "]]";
} else {
definition = "[[" + word.getWord() + "]]";
}
} else {
definition = word.getWord();
}
return definition;
}
public enum MathMarkUpType {
LATEX, MATHML, MATH_TEMPLATE, MVAR_TEMPLATE, LATEXII
}
}