package org.jabref.logic.formatter.bibtexfields;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.layout.LayoutFormatter;
import org.jabref.logic.util.strings.HTMLUnicodeConversionMaps;
import org.jabref.model.cleanup.Formatter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class HtmlToLatexFormatter implements LayoutFormatter, Formatter {
private static final Log LOGGER = LogFactory.getLog(HtmlToLatexFormatter.class);
private static final int MAX_TAG_LENGTH = 100;
private static final Pattern ESCAPED_PATTERN = Pattern.compile("([x]*)([0]*)(\\p{XDigit}+);");
private static final Pattern ESCAPED_PATTERN2 = Pattern.compile("(.)([x]*)([0]*)(\\p{XDigit}+);");
private static final Pattern ESCAPED_PATTERN3 = Pattern.compile("([x]*)([0]*)(\\p{XDigit}+);");
private static final Pattern ESCAPED_PATTERN4 = Pattern.compile("&(\\w+);");
@Override
public String format(String text) {
String result = Objects.requireNonNull(text);
if (result.isEmpty()) {
return result;
}
StringBuilder sb = new StringBuilder();
// Deal with the form <sup>k</sup>and <sub>k</sub>
result = result.replaceAll("<[ ]?sup>([^<]+)</sup>", "\\\\textsuperscript\\{$1\\}");
result = result.replaceAll("<[ ]?sub>([^<]+)</sub>", "\\\\textsubscript\\{$1\\}");
// TODO: maybe rewrite this based on regular expressions instead
// Note that (at least) the IEEE Xplore fetcher must be fixed as it relies on the current way to
// remove tags for its image alt-tag to equation converter
for (int i = 0; i < result.length(); i++) {
int c = result.charAt(i);
if (c == '<') {
i = readTag(result, i);
} else {
sb.append((char) c);
}
}
result = sb.toString();
// Handle text based HTML entities
Set<String> patterns = HTMLUnicodeConversionMaps.HTML_LATEX_CONVERSION_MAP.keySet();
for (String pattern : patterns) {
result = result.replace(pattern, HTMLUnicodeConversionMaps.HTML_LATEX_CONVERSION_MAP.get(pattern));
}
// Handle numerical HTML entities
Matcher m = ESCAPED_PATTERN.matcher(result);
while (m.find()) {
int num = Integer.decode(m.group(1).replace("x", "#") + m.group(3));
if (HTMLUnicodeConversionMaps.NUMERICAL_LATEX_CONVERSION_MAP.containsKey(num)) {
result = result.replace("" + m.group(1) + m.group(2) + m.group(3) + ";",
HTMLUnicodeConversionMaps.NUMERICAL_LATEX_CONVERSION_MAP.get(num));
}
}
// Combining accents
m = ESCAPED_PATTERN2.matcher(result);
while (m.find()) {
int num = Integer.decode(m.group(2).replace("x", "#") + m.group(4));
if (HTMLUnicodeConversionMaps.ESCAPED_ACCENTS.containsKey(num)) {
if ("i".equals(m.group(1))) {
result = result.replace(m.group(1) + "" + m.group(2) + m.group(3) + m.group(4) + ";",
"{\\" + HTMLUnicodeConversionMaps.ESCAPED_ACCENTS.get(num) + "{\\i}}");
} else if ("j".equals(m.group(1))) {
result = result.replace(m.group(1) + "" + m.group(2) + m.group(3) + m.group(4) + ";",
"{\\" + HTMLUnicodeConversionMaps.ESCAPED_ACCENTS.get(num) + "{\\j}}");
} else {
result = result.replace(m.group(1) + "" + m.group(2) + m.group(3) + m.group(4) + ";",
"{\\" + HTMLUnicodeConversionMaps.ESCAPED_ACCENTS.get(num) + "{" + m.group(1) + "}}");
}
}
}
// Find non-converted numerical characters
m = ESCAPED_PATTERN3.matcher(result);
while (m.find()) {
int num = Integer.decode(m.group(1).replace("x", "#") + m.group(3));
LOGGER.warn("HTML escaped char not converted: " + m.group(1) + m.group(2) + m.group(3) + " = " + Integer.toString(num));
}
// Remove $$ in case of two adjacent conversions
result = result.replace("$$", "");
// Find non-covered special characters with alphabetic codes
m = ESCAPED_PATTERN4.matcher(result);
while (m.find()) {
LOGGER.warn("HTML escaped char not converted: " + m.group(1));
}
return result.trim();
}
@Override
public String getDescription() {
return Localization.lang("Converts HTML code to LaTeX code.");
}
@Override
public String getExampleInput() {
return "<strong>JabRef</strong>";
}
private int readTag(String text, int position) {
// Have just read the < character that starts the tag.
int index = text.indexOf('>', position);
if ((index > position) && ((index - position) < MAX_TAG_LENGTH)) {
return index; // Just skip the tag.
} else {
return position; // Don't do anything.
}
}
@Override
public String getName() {
return Localization.lang("HTML to LaTeX");
}
@Override
public String getKey() {
return "html_to_latex";
}
}