/**************************************************************************
OmegaT - Computer Assisted Translation (CAT) tool
with fuzzy matching, translation memory, keyword search,
glossaries, and translation leveraging into updated projects.
Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
2006 Thomas Huriaux
2008 Martin Fleurke
2009 Arno Peters
2011 Didier Briel
2014 Adiel Mittmann
Home page: http://www.omegat.org/
Support center: http://groups.yahoo.com/group/OmegaT/
This file is part of OmegaT.
OmegaT is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OmegaT is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
**************************************************************************/
package org.omegat.filters2.latex;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.ListIterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.omegat.filters2.AbstractFilter;
import org.omegat.filters2.Instance;
import org.omegat.util.Log;
import org.omegat.util.OStrings;
/**
* Filter to support LaTeX files.
*
* @author Keith Godfrey
* @author Maxym Mykhalchuk
* @author Thomas Huriaux
* @author Martin Fleurke
* @author Arno Peters
* @author Didier Briel
* @author Adiel Mittmann
*/
public class LatexFilter extends AbstractFilter {

    /*
     * Character categories returned by findStringCategory(). The numbering
     * matches TeX's category codes; only ESCAPE, ALIGN_TAB, SPACE, LETTER and
     * COMMENT get special treatment in the line scanner, everything else is
     * copied into the paragraph as ordinary text.
     */
    private static final int CAT_ESCAPE = 0; // backslash
    private static final int CAT_BEGIN_GROUP = 1; // {
    private static final int CAT_END_GROUP = 2; // }
    private static final int CAT_MATH_SHIFT = 3; // $
    private static final int CAT_ALIGN_TAB = 4; // &
    private static final int CAT_END_OF_LINE = 5; // \n
    private static final int CAT_PARAMETER = 6; // #
    private static final int CAT_SUPERSCRIPT = 7; // ^
    private static final int CAT_SUBSCRIPT = 8; // _
    private static final int CAT_IGNORED = 9; // NUL
    private static final int CAT_SPACE = 10; // blank or tab
    private static final int CAT_LETTER = 11; // a-z, A-Z
    private static final int CAT_OTHER = 12; // anything else
    private static final int CAT_ACTIVE = 13; // ~
    private static final int CAT_COMMENT = 14; // %

    /** Scanner state within the current input line. */
    private enum LineState {
        /** Nothing but blanks (or a comment) has been seen on this line yet. */
        NEW_LINE,
        /** The last thing consumed was text or a control sequence. */
        MIDDLE,
        /** A blank has just been emitted; further blanks are dropped. */
        SKIPPING_BLANKS
    }

    /**
     * Matches a whole paragraph as: leading placeholder tags, translatable
     * text, trailing placeholder tags. Compiled once because it is used for
     * every paragraph.
     */
    private static final Pattern TAGS_TEXT_TAGS = Pattern.compile(
            "^((\\s*</?[nipu]\\d+>\\s*)*)" + "(.*?)" + "((\\s*</?[nipu]\\d+>\\s*)*)$");

    /** Commands whose single argument is never translated; command and argument become one tag. */
    private static final Set<String> ONE_ARG_NO_TEXT = commandSet(
            "\\begin", "\\end", "\\cite", "\\label", "\\ref", "\\pageref", "\\pagestyle",
            "\\thispagestyle", "\\vspace", "\\hspace", "\\vskip", "\\hskip", "\\put",
            "\\includegraphics", "\\documentclass", "\\usepackage");

    /** Commands whose argument stays inline in the segment, wrapped in an opening/closing tag pair. */
    private static final Set<String> ONE_ARG_INLINE_TEXT = commandSet(
            "\\emph", "\\textbf", "\\texttt", "\\textsf", "\\textit", "\\hbox", "\\mbox", "\\vbox");

    /** Commands whose argument is processed as a separate paragraph. */
    private static final Set<String> ONE_ARG_PAR_TEXT = commandSet(
            "\\typeout", "\\footnote", "\\author", "\\index", "\\title", "\\Chapter", "\\chapter",
            "\\section");

    /** Builds an immutable lookup set of command names. */
    private static Set<String> commandSet(String... names) {
        return Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(names)));
    }

    public String getFileFormatName() {
        return OStrings.getString("LATEXFILTER_FILTER_NAME");
    }

    public Instance[] getDefaultInstances() {
        return new Instance[] { new Instance("*.tex"), new Instance("*.latex"), };
    }

    public boolean isSourceEncodingVariable() {
        return true;
    }

    public boolean isTargetEncodingVariable() {
        return true;
    }

    @Override
    protected boolean requirePrevNextFields() {
        return true;
    }

    /**
     * Reads a LaTeX document from {@code in} and writes the processed document
     * to {@code out}. A leading byte order mark, if present, is skipped.
     *
     * @param in
     *            Source document
     * @param out
     *            Target document
     * @param fc
     *            Filter context (not used by this filter)
     * @throws IOException
     *             if reading or writing fails
     */
    @Override
    public void processFile(BufferedReader in, BufferedWriter out, org.omegat.filters2.FilterContext fc)
            throws IOException {
        // BOM (byte order mark) bugfix: consume U+FEFF if it is the first character
        in.mark(1);
        int ch = in.read();
        if (ch != 0xFEFF) {
            in.reset();
        }
        processLatexFile(in, out);
    }

    /**
     * Classifies a one-character string into one of the {@code CAT_*}
     * categories.
     *
     * @param c
     *            a string holding a single character
     * @return the category; {@code CAT_OTHER} for anything unrecognised,
     *         including strings that are not exactly one character long
     */
    private int findStringCategory(String c) {
        if (c.length() != 1) {
            return CAT_OTHER;
        }
        char ch = c.charAt(0);
        switch (ch) {
        case '\\':
            return CAT_ESCAPE;
        case '{':
            return CAT_BEGIN_GROUP;
        case '}':
            return CAT_END_GROUP;
        case '$':
            return CAT_MATH_SHIFT;
        case '&':
            return CAT_ALIGN_TAB;
        case '\n':
            return CAT_END_OF_LINE;
        case '#':
            return CAT_PARAMETER;
        case '^':
            return CAT_SUPERSCRIPT;
        case '_':
            return CAT_SUBSCRIPT;
        case '\0':
            return CAT_IGNORED;
        case ' ':
        case '\t':
            return CAT_SPACE;
        case '~':
            return CAT_ACTIVE;
        case '%':
            return CAT_COMMENT;
        default:
            boolean asciiLetter = (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
            return asciiLetter ? CAT_LETTER : CAT_OTHER;
        }
    }

    /**
     * Processes a LaTeX document.
     * <p>
     * Lines are joined (separated by a single blank) into paragraphs. A
     * paragraph is flushed through {@link #processParagraph} when a line
     * without any text or command is met (an empty line or a comment-only
     * line), when an unescaped {@code &} is met, and at end of input. Comment
     * text is collected separately and written after the paragraph it occurred
     * in.
     *
     * @param in
     *            Source document
     * @param out
     *            Target document
     * @throws java.io.IOException
     *             if reading or writing fails
     */
    private void processLatexFile(BufferedReader in, Writer out) throws IOException {
        StringBuilder par = new StringBuilder();
        StringBuilder comment = new StringBuilder();
        // Control sequences seen in the current paragraph, without duplicates, in order of appearance
        Set<String> commands = new LinkedHashSet<String>();

        String line;
        while ((line = in.readLine()) != null) {
            // line.split("") gave a leading empty element on some Java versions,
            // so split on every position except the start of the line instead.
            String[] c = line.isEmpty() ? new String[0] : line.split("(?!^)");
            LineState state = LineState.NEW_LINE;
            int idx = 0;
            while (idx < c.length) {
                String cidx = c[idx];
                int cat = findStringCategory(cidx);
                if (cat == CAT_ESCAPE) {
                    /* parse control sequence: backslash + letters, or backslash + one other character */
                    StringBuilder cmd = new StringBuilder(cidx);
                    idx++;
                    while (idx < c.length) {
                        String cmdc = c[idx];
                        if (findStringCategory(cmdc) == CAT_LETTER) {
                            cmd.append(cmdc);
                        } else if (cmd.length() == 1) {
                            // control symbol such as \% or \\
                            cmd.append(cmdc);
                            break;
                        } else {
                            // end of a control word: let the outer loop re-read this character
                            idx--;
                            break;
                        }
                        idx++;
                    }
                    // Always leave "new line"/"skipping blanks" here, also when the
                    // control word runs up to the end of the line. Otherwise the
                    // next line would be glued onto the command name, or the line
                    // would wrongly be taken for a paragraph break.
                    state = LineState.MIDDLE;
                    commands.add(cmd.toString());
                    par.append(cmd);
                } else if (cat == CAT_ALIGN_TAB) {
                    /* table column separator: each cell is a paragraph of its own */
                    out.write(processParagraph(commands, par.toString()));
                    out.write("&");
                    par.setLength(0);
                    commands.clear();
                } else if (cat == CAT_SPACE) {
                    // keep one blank after text, drop leading and repeated blanks
                    if (state == LineState.MIDDLE) {
                        state = LineState.SKIPPING_BLANKS;
                        par.append(cidx);
                    }
                } else if (cat == CAT_COMMENT) {
                    /* parse comment: everything up to the end of the line */
                    for (; idx < c.length; idx++) {
                        comment.append(c[idx]);
                    }
                } else {
                    state = LineState.MIDDLE;
                    par.append(cidx);
                }
                idx++;
            }
            /* at the end of the line */
            if (state == LineState.NEW_LINE) {
                /* \par */
                if (par.length() > 0) {
                    out.write(processParagraph(commands, par.toString()));
                    out.write("\n\n");
                    par.setLength(0);
                }
                commands.clear();
                if (comment.length() > 0) { // If there is a comment, write it
                    out.write(comment.toString());
                    out.write("\n");
                    comment.setLength(0);
                }
            } else if (state == LineState.MIDDLE) {
                // the line break acts as a blank between words
                par.append(" ");
            }
        }
        // output remaining buffers
        boolean hasTrailingParagraph = par.length() > 0;
        if (hasTrailingParagraph) {
            out.write(processParagraph(commands, par.toString()));
        }
        if (comment.length() > 0) {
            // a comment belonging to the last paragraph must not be lost
            if (hasTrailingParagraph) {
                out.write("\n");
            }
            out.write(comment.toString());
            out.write("\n");
        }
    }

    /**
     * Replaces a few LaTeX constructs by plain characters (or by the
     * {@code <br0>} tag for {@code \\}) before the text is segmented.
     */
    private String substituteUnicode(String par) {
        par = par.replace("\\\\", "<br0>");
        par = par.replaceAll("\\{?\\\\ss\\}?", "\u00df");
        par = par.replaceAll("\\{?\\\\glqq\\}?(\\{\\})?", "\u301f");
        par = par.replaceAll("\\{?\\\\grqq\\}?(\\{\\})?", "\u301d");
        par = par.replaceAll("\\{?\\\\glq\\}?(\\{\\})?", "\u201a");
        par = par.replaceAll("\\{?\\\\grq\\}?(\\{\\})?", "\u2018");
        par = par.replace("\\%", "%");
        par = par.replace("\\-", "\u00ad");
        par = par.replace("\\,", "\u2009");
        par = par.replace("~", "\u00a0");
        return par;
    }

    /**
     * Converts the characters that need LaTeX markup (non-breaking space, thin
     * space, soft hyphen, percent sign, {@code <br0>}) back to LaTeX.
     */
    private String resubstituteTex(String par) {
        par = par.replace("\u00a0", "~");
        par = par.replace("\u2009", "\\,");
        par = par.replace("\u00ad", "\\-");
        par = par.replace("%", "\\%");
        par = par.replace("<br0>", "\\\\");
        return par;
    }

    /**
     * Replaces every "command + untranslatable argument" occurrence by a
     * {@code <nN>} tag.
     *
     * @param substituted
     *            receives one {original text, tag} pair per replacement, most
     *            recent first
     * @param commands
     *            control sequences present in the paragraph
     * @param par
     *            paragraph text
     * @return the paragraph with the occurrences replaced by tags
     */
    private String replaceOneArgNoText(Deque<String[]> substituted, Set<String> commands, String par) {
        int counter = 0;
        for (String command : commands) {
            if (!ONE_ARG_NO_TEXT.contains(command)) {
                continue;
            }
            String find = Pattern.quote(command) + "\\*?"
                    + "(" + "\\[" + "[^\\]]*" + "\\]" // opt [] arg
                    + "|" + "\\(" + "[^\\)]*" + "\\)" // opt () arg
                    + ")?\\s*" + "\\{" + "[^\\}]*+" + "\\}";
            Matcher m = Pattern.compile(find).matcher(par);
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                String tag = "<n" + counter + ">";
                substituted.addFirst(new String[] { m.group(0), tag });
                m.appendReplacement(sb, tag);
                counter++;
            }
            m.appendTail(sb);
            par = sb.toString();
        }
        return par;
    }

    /**
     * Wraps the argument of inline-text commands in {@code <iN>...</iN>}: the
     * opening tag stands for the command and its opening brace, the closing
     * tag for the closing brace.
     *
     * @param substituted
     *            receives the {original text, tag} pairs, most recent first
     * @param commands
     *            control sequences present in the paragraph
     * @param par
     *            paragraph text
     * @return the paragraph with the occurrences replaced by tags
     */
    private String replaceOneArgInlineText(Deque<String[]> substituted, Set<String> commands, String par) {
        int counter = 0;
        for (String command : commands) {
            if (!ONE_ARG_INLINE_TEXT.contains(command)) {
                continue;
            }
            String find = "(" + Pattern.quote(command) + "\\s*" + "\\{" + ")" + "(" + "[^\\}]*+" + ")" + "\\}";
            Matcher m = Pattern.compile(find).matcher(par);
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                String openTag = "<i" + counter + ">";
                String closeTag = "</i" + counter + ">";
                substituted.addFirst(new String[] { m.group(1), openTag });
                substituted.addFirst(new String[] { "}", closeTag });
                // "$2" re-inserts the argument text between the two tags
                m.appendReplacement(sb, openTag + "$2" + closeTag);
                counter++;
            }
            m.appendTail(sb);
            par = sb.toString();
        }
        return par;
    }

    /**
     * Replaces every "command + paragraph argument" occurrence by a
     * {@code <pN>} tag. The argument itself is run through
     * {@link #processParagraph} and stored, already processed, as part of the
     * text the tag stands for.
     *
     * @param substituted
     *            receives one {processed text, tag} pair per replacement, most
     *            recent first
     * @param commands
     *            control sequences present in the paragraph
     * @param par
     *            paragraph text
     * @return the paragraph with the occurrences replaced by tags
     */
    private String replaceOneArgParText(Deque<String[]> substituted, Set<String> commands, String par) {
        int counter = 0;
        for (String command : commands) {
            if (!ONE_ARG_PAR_TEXT.contains(command)) {
                continue;
            }
            String find = "(" + Pattern.quote(command) + "\\*?\\s*" + ")" + "\\{" + "(" + "[^\\}]*+" + ")" + "\\}";
            Matcher m = Pattern.compile(find).matcher(par);
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                String tag = "<p" + counter + ">";
                String content = "";
                if (m.group(2) != null) {
                    content = processParagraph(commands, m.group(2));
                }
                substituted.addFirst(new String[] { m.group(1) + "{" + content + "}", tag });
                m.appendReplacement(sb, tag);
                counter++;
            }
            m.appendTail(sb);
            par = sb.toString();
        }
        return par;
    }

    /**
     * Replaces every remaining occurrence of a control sequence by a
     * {@code <uN>} tag.
     * <p>
     * The command name is matched literally, so control symbols made of regex
     * metacharacters (such as {@code \(} or {@code \.}) are handled like any
     * other command. A control word is only matched when no further letter
     * follows, so that {@code \foo} is not found inside {@code \foobar}.
     *
     * @param substituted
     *            receives one {original text, tag} pair per replacement, most
     *            recent first
     * @param commands
     *            control sequences present in the paragraph
     * @param par
     *            paragraph text
     * @return the paragraph with the occurrences replaced by tags
     */
    private String replaceUnknownCommand(Deque<String[]> substituted, Set<String> commands, String par) {
        int counter = 0;
        for (String command : commands) {
            String find = Pattern.quote(command);
            if (command.length() > 1
                    && findStringCategory(command.substring(command.length() - 1)) == CAT_LETTER) {
                find += "(?![a-zA-Z])";
            }
            Matcher m = Pattern.compile(find).matcher(par);
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                String tag = "<u" + counter + ">";
                substituted.addFirst(new String[] { m.group(0), tag });
                m.appendReplacement(sb, tag);
                counter++;
            }
            m.appendTail(sb);
            par = sb.toString();
        }
        return par;
    }

    /**
     * Processes one paragraph: LaTeX markup is replaced by placeholder tags,
     * the text between the leading and trailing tags is handed to
     * {@code processEntry}, and finally the tags are turned back into the
     * LaTeX they stand for.
     *
     * @param commands
     *            control sequences present in the paragraph
     * @param par
     *            paragraph text
     * @return the processed paragraph as LaTeX source
     */
    private String processParagraph(Set<String> commands, String par) {
        // {text the tag stands for, tag}; most recently created pair first
        Deque<String[]> substituted = new ArrayDeque<String[]>();
        par = substituteUnicode(par);
        par = replaceOneArgNoText(substituted, commands, par);
        par = replaceOneArgInlineText(substituted, commands, par);
        par = replaceOneArgParText(substituted, commands, par);
        par = replaceUnknownCommand(substituted, commands, par);

        // Tags at either end of the paragraph are kept out of the text passed to processEntry
        Matcher m = TAGS_TEXT_TAGS.matcher(par);
        if (m.find()) {
            StringBuilder rebuilt = new StringBuilder();
            if (m.group(1) != null) {
                rebuilt.append(m.group(1));
            }
            if (m.group(3) != null) {
                rebuilt.append(processEntry(m.group(3)));
            }
            if (m.group(4) != null) {
                rebuilt.append(m.group(4));
            }
            par = rebuilt.toString();
        }
        par = resubstituteTex(par);

        // Restore in reverse order of creation: text restored for a later tag
        // may still contain earlier tags (e.g. a <pN> argument holding <nN>).
        for (String[] subst : substituted) {
            par = par.replace(subst[1], subst[0]);
        }
        return par;
    }
}