// LatexFilter.java example

// Explorer
// OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2006 Thomas Huriaux
               2008 Martin Fleurke
               2009 Arno Peters
               2011 Didier Briel
               2014 Adiel Mittmann
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.filters2.latex;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.ListIterator;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.omegat.filters2.AbstractFilter;
import org.omegat.filters2.Instance;
import org.omegat.util.Log;
import org.omegat.util.OStrings;

/**
 * Filter to support LaTeX files.
 * <p>
 * The source is read line by line and accumulated into paragraphs. For each
 * paragraph the LaTeX commands are replaced by short placeholder tags
 * ({@code <n0>}, {@code <i0>...</i0>}, {@code <p0>}, {@code <u0>}), the
 * remaining text is handed to {@code processEntry}, and the original commands
 * are then substituted back in place of the placeholders.
 *
 * @author Keith Godfrey
 * @author Maxym Mykhalchuk
 * @author Thomas Huriaux
 * @author Martin Fleurke
 * @author Arno Peters
 * @author Didier Briel
 * @author Adiel Mittmann
 */
public class LatexFilter extends AbstractFilter {

    /*
     * Character categories returned by findCharCategory(char). The numbering
     * (and therefore the names chosen here) appears to mirror TeX's category
     * codes; only a few of them drive the parser below.
     */
    private static final int CAT_ESCAPE = 0;
    private static final int CAT_BEGIN_GROUP = 1;
    private static final int CAT_END_GROUP = 2;
    private static final int CAT_MATH_SHIFT = 3;
    private static final int CAT_ALIGN_TAB = 4;
    private static final int CAT_END_OF_LINE = 5;
    private static final int CAT_PARAMETER = 6;
    private static final int CAT_SUPERSCRIPT = 7;
    private static final int CAT_SUBSCRIPT = 8;
    private static final int CAT_IGNORED = 9;
    private static final int CAT_SPACE = 10;
    private static final int CAT_LETTER = 11;
    private static final int CAT_OTHER = 12;
    private static final int CAT_ACTIVE = 13;
    private static final int CAT_COMMENT = 14;

    /**
     * Splits a paragraph into leading placeholder tags (group 1), the text
     * to translate (group 3) and trailing placeholder tags (group 4).
     */
    private static final Pattern SURROUNDING_TAGS = Pattern
            .compile("^((\\s*</?[nipu]\\d+>\\s*)*)" + "(.*?)" + "((\\s*</?[nipu]\\d+>\\s*)*)$");

    /** Commands whose single argument is not translatable text. */
    private static final Set<String> ONE_ARG_NO_TEXT = immutableSet("\\begin", "\\end", "\\cite",
            "\\label", "\\ref", "\\pageref", "\\pagestyle", "\\thispagestyle", "\\vspace", "\\hspace",
            "\\vskip", "\\hskip", "\\put", "\\includegraphics", "\\documentclass", "\\usepackage");

    /** Commands whose single argument is text translated inline with the surrounding paragraph. */
    private static final Set<String> ONE_ARG_INLINE_TEXT = immutableSet("\\emph", "\\textbf", "\\texttt",
            "\\textsf", "\\textit", "\\hbox", "\\mbox", "\\vbox");

    /** Commands whose single argument is processed as a paragraph of its own. */
    private static final Set<String> ONE_ARG_PAR_TEXT = immutableSet("\\typeout", "\\footnote", "\\author",
            "\\index", "\\title", "\\Chapter", "\\chapter", "\\section");

    /** Position of the parser within the current input line. */
    private enum LineState {
        /** Nothing but blanks (or a comment) has been seen on this line yet. */
        NEW_LINE,
        /** Inside the content of the line. */
        MIDDLE,
        /** A blank has just been copied; further blanks are dropped. */
        SKIPPING_BLANKS
    }

    /** Builds an unmodifiable set from the given command names. */
    private static Set<String> immutableSet(String... items) {
        return Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(items)));
    }

    /**
     * Returns the display name of this file format, looked up under the
     * resource key {@code LATEXFILTER_FILTER_NAME}.
     */
    public String getFileFormatName() {
        return OStrings.getString("LATEXFILTER_FILTER_NAME");
    }

    /** Returns the default filter instances: {@code *.tex} and {@code *.latex}. */
    public Instance[] getDefaultInstances() {
        return new Instance[] { new Instance("*.tex"), new Instance("*.latex"), };
    }

    /** Always {@code true}. */
    public boolean isSourceEncodingVariable() {
        return true;
    }

    /** Always {@code true}. */
    public boolean isTargetEncodingVariable() {
        return true;
    }

    @Override
    protected boolean requirePrevNextFields() {
        return true;
    }

    /**
     * Skips a leading byte order mark, if present, and then filters the
     * document from {@code in} to {@code out}.
     *
     * @param in
     *            Source document
     * @param out
     *            Target document
     * @param fc
     *            Filter context (not used by this filter)
     * @throws IOException
     *             if reading or writing fails
     */
    @Override
    public void processFile(BufferedReader in, BufferedWriter out, org.omegat.filters2.FilterContext fc) throws IOException {
        // BOM (byte order mark) bugfix: consume U+FEFF if it is the first
        // character, otherwise rewind to the start.
        in.mark(1);
        int ch = in.read();
        if (ch != 0xFEFF) {
            in.reset();
        }

        processLatexFile(in, out);
    }

    /**
     * Classifies a single character.
     *
     * @param c
     *            character to classify
     * @return one of the {@code CAT_*} constants; {@link #CAT_OTHER} for
     *         anything not listed explicitly
     */
    private static int findCharCategory(char c) {
        switch (c) {
        case '\\':
            return CAT_ESCAPE;
        case '{':
            return CAT_BEGIN_GROUP;
        case '}':
            return CAT_END_GROUP;
        case '$':
            return CAT_MATH_SHIFT;
        case '&':
            return CAT_ALIGN_TAB;
        case '\n':
            return CAT_END_OF_LINE;
        case '#':
            return CAT_PARAMETER;
        case '^':
            return CAT_SUPERSCRIPT;
        case '_':
            return CAT_SUBSCRIPT;
        case '\000':
            return CAT_IGNORED;
        case ' ':
        case '\t':
            return CAT_SPACE;
        case '~':
            return CAT_ACTIVE;
        case '%':
            return CAT_COMMENT;
        default:
            boolean asciiLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            return asciiLetter ? CAT_LETTER : CAT_OTHER;
        }
    }

    /**
     * Processes a LaTeX document.
     * <p>
     * Lines are concatenated into a paragraph buffer; a line on which the
     * parser never leaves the {@code NEW_LINE} state (empty, blank,
     * comment-only, or consisting solely of a control word) ends the current
     * paragraph. A table column separator ({@code &}) also flushes the
     * buffer. Comments are collected separately and written after the
     * paragraph they occurred in.
     *
     * @param in
     *            Source document
     * @param out
     *            Target document
     * @throws java.io.IOException
     *             if reading or writing fails
     */
    private void processLatexFile(BufferedReader in, Writer out) throws IOException {
        StringBuilder par = new StringBuilder();
        StringBuilder comment = new StringBuilder();
        // Distinct control sequences seen in the current paragraph, in order
        // of first appearance.
        LinkedList<String> commands = new LinkedList<String>();

        String line;
        while ((line = in.readLine()) != null) {
            LineState state = LineState.NEW_LINE;
            final int len = line.length();

            int idx = 0;
            while (idx < len) {
                char ch = line.charAt(idx);
                int cat = findCharCategory(ch);

                if (cat == CAT_ESCAPE) {
                    /* parse control sequence */
                    StringBuilder cmd = new StringBuilder();
                    cmd.append(ch);
                    idx++;
                    while (idx < len) {
                        char next = line.charAt(idx);
                        if (findCharCategory(next) == CAT_LETTER) {
                            cmd.append(next);
                        } else if (cmd.length() == 1) {
                            // control symbol: backslash plus one non-letter
                            cmd.append(next);
                            state = LineState.MIDDLE;
                            break;
                        } else {
                            // control word ended; re-scan the terminating character
                            idx--;
                            state = LineState.MIDDLE;
                            break;
                        }
                        idx++;
                    }
                    // The control word ran up to the end of the line. If a
                    // blank had just been skipped, no separator would be added
                    // at the line end and the next line's first letters would
                    // fuse with the command name, so fall back to MIDDLE.
                    // A NEW_LINE state is deliberately left untouched.
                    if (state == LineState.SKIPPING_BLANKS) {
                        state = LineState.MIDDLE;
                    }

                    String cmdText = cmd.toString();
                    if (!commands.contains(cmdText)) {
                        commands.add(cmdText);
                    }
                    par.append(cmdText);
                } else if (cat == CAT_ALIGN_TAB) {
                    /* table column separator */
                    out.write(processParagraph(commands, par.toString()));
                    out.write("&");
                    par.setLength(0);
                    commands.clear();
                } else if (cat == CAT_SPACE) {
                    // keep the first blank after content, drop the others
                    if (state == LineState.MIDDLE) {
                        state = LineState.SKIPPING_BLANKS;
                        par.append(ch);
                    }
                } else if (cat == CAT_COMMENT) {
                    /* the comment extends to the end of the line */
                    comment.append(line.substring(idx));
                    break;
                } else {
                    state = LineState.MIDDLE;
                    par.append(ch);
                }

                idx++;
            }

            /* at the end of the line */
            if (state == LineState.NEW_LINE) {
                /* \par */
                if (par.length() > 0) {
                    out.write(processParagraph(commands, par.toString()));
                    out.write("\n\n");
                    par.setLength(0);
                }
                commands.clear();
                if (comment.length() > 0) { // If there is a comment, write it
                    out.write(comment.toString());
                    out.write("\n");
                    comment.setLength(0);
                }
            } else if (state == LineState.MIDDLE) {
                // the line break acts as a word separator
                par.append(" ");
            }
        }

        // output remaining buffers
        boolean wroteParagraph = false;
        if (par.length() > 0) {
            out.write(processParagraph(commands, par.toString()));
            wroteParagraph = true;
        }
        // A comment on the last text line was previously dropped because it
        // is only flushed at a paragraph break; write it here instead.
        if (comment.length() > 0) {
            if (wroteParagraph) {
                // start a fresh line so the '%' cannot combine with the
                // paragraph's last character
                out.write("\n");
            }
            out.write(comment.toString());
            out.write("\n");
        }
    }

    /**
     * Replaces a fixed set of LaTeX sequences by placeholder text or Unicode
     * characters before translation ({@code \\} becomes {@code <br0>},
     * {@code \ss} becomes U+00DF, {@code ~} becomes U+00A0, and so on).
     *
     * @param par
     *            paragraph text
     * @return the paragraph with the substitutions applied
     */
    private String substituteUnicode(String par) {
        par = par.replaceAll("\\\\\\\\", "<br0>");
        par = par.replaceAll("\\{?\\\\ss\\}?", "\u00df");
        par = par.replaceAll("\\{?\\\\glqq\\}?(\\{\\})?", "\u301f");
        par = par.replaceAll("\\{?\\\\grqq\\}?(\\{\\})?", "\u301d");
        par = par.replaceAll("\\{?\\\\glq\\}?(\\{\\})?", "\u201a");
        par = par.replaceAll("\\{?\\\\grq\\}?(\\{\\})?", "\u2018");
        par = par.replaceAll("\\\\%", "%");
        par = par.replaceAll("\\\\-", "\u00ad");
        par = par.replaceAll("\\\\,", "\u2009");
        par = par.replaceAll("~", "\u00a0");
        return par;
    }

    /**
     * Converts back the subset of {@link #substituteUnicode(String)}
     * replacements that have a LaTeX spelling restored here: U+00A0, U+2009,
     * U+00AD, {@code %} and {@code <br0>}. The sharp s and quotation mark
     * replacements are left as Unicode characters.
     *
     * @param par
     *            paragraph text after translation
     * @return the paragraph with LaTeX sequences restored
     */
    private String resubstituteTex(String par) {
        par = par.replaceAll("\u00a0", "~");
        par = par.replaceAll("\u2009", "\\\\,");
        par = par.replaceAll("\u00ad", "\\\\-");
        par = par.replaceAll("%", "\\\\%");
        par = par.replaceAll("<br0>", "\\\\\\\\");
        return par;
    }

    /**
     * Replaces every occurrence of a known no-text command, together with
     * its optional {@code [..]} or {@code (..)} argument and its braced
     * argument, by a placeholder {@code <nK>}.
     *
     * @param substituted
     *            receives (original, placeholder) pairs, most recent first
     * @param commands
     *            control sequences present in the paragraph
     * @param par
     *            paragraph text
     * @return the paragraph with placeholders inserted
     */
    private String replaceOneArgNoText(LinkedList<String[]> substituted, LinkedList<String> commands,
            String par) {
        int counter = 0;

        for (String command : commands) {
            if (!ONE_ARG_NO_TEXT.contains(command)) {
                continue;
            }

            String find = ("\\" + command + "\\*?" + "(" + "\\[" + "[^\\]]*" + "\\]" + // opt [] arg
                    "|" + "\\(" + "[^\\)]*" + "\\)" + // opt () arg
                    ")?\\s*" + "\\{" + "[^\\}]*+" + "\\}");

            Matcher m = Pattern.compile(find).matcher(par);
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                String replace = "<n" + counter + ">";
                String[] subst = { reHarden(m.group(0)), reHarden(replace) };
                substituted.addFirst(subst);
                m.appendReplacement(sb, replace);
                counter++;
            }
            m.appendTail(sb);

            par = sb.toString();
        }
        return par;
    }

    /**
     * Wraps the argument of every known inline-text command in a placeholder
     * pair: {@code \cmd{text}} becomes {@code <iK>text</iK>}.
     *
     * @param substituted
     *            receives (original, placeholder) pairs, most recent first
     * @param commands
     *            control sequences present in the paragraph
     * @param par
     *            paragraph text
     * @return the paragraph with placeholders inserted
     */
    private String replaceOneArgInlineText(LinkedList<String[]> substituted, LinkedList<String> commands,
            String par) {
        int counter = 0;

        for (String command : commands) {
            if (!ONE_ARG_INLINE_TEXT.contains(command)) {
                continue;
            }

            String find = ("(" + "\\" + command + "\\s*" + "\\{" + ")" + "(" + "[^\\}]*+" + ")" + "\\}");

            Matcher m = Pattern.compile(find).matcher(par);
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                String preReplace = "<i" + counter + ">";
                String postReplace = "</i" + counter + ">";

                // opening tag stands for the command and its opening brace
                String[] s1 = { reHarden(m.group(1)), reHarden(preReplace) };
                substituted.addFirst(s1);

                // closing tag stands for the closing brace
                String[] s2 = { reHarden("}"), reHarden(postReplace) };
                substituted.addFirst(s2);

                // "$2" is a group reference: the argument text stays in place
                String replace = (preReplace + "$2" + postReplace);
                m.appendReplacement(sb, replace);
                counter++;
            }
            m.appendTail(sb);

            par = sb.toString();
        }
        return par;
    }

    /**
     * Replaces every known paragraph-text command and its braced argument by
     * a placeholder {@code <pK>}. The argument is run through
     * {@link #processParagraph(LinkedList, String)} on its own, and the
     * processed result is what gets restored in place of the placeholder.
     *
     * @param substituted
     *            receives (processed original, placeholder) pairs, most recent first
     * @param commands
     *            control sequences present in the paragraph
     * @param par
     *            paragraph text
     * @return the paragraph with placeholders inserted
     */
    private String replaceOneArgParText(LinkedList<String[]> substituted, LinkedList<String> commands,
            String par) {
        int counter = 0;

        for (String command : commands) {
            if (!ONE_ARG_PAR_TEXT.contains(command)) {
                continue;
            }

            String find = ("(" + "\\" + command + "\\*?\\s*" + ")" + "\\{" + "(" + "[^\\}]*+" + ")" + "\\}");

            Matcher m = Pattern.compile(find).matcher(par);
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                String replace = "<p" + counter + ">";
                String content = "";
                if (m.group(2) != null) {
                    content = processParagraph(commands, m.group(2));
                }

                String[] subst = { reHarden(m.group(1) + "{" + content + "}"), reHarden(replace) };

                substituted.addFirst(subst);
                m.appendReplacement(sb, replace);
                counter++;
            }
            m.appendTail(sb);

            par = sb.toString();
        }
        return par;
    }

    /**
     * Replaces every remaining occurrence of each control sequence by a
     * placeholder {@code <uK>}.
     * <p>
     * The command text is matched literally, so control symbols such as
     * {@code \(} or {@code \.} no longer produce an invalid or over-broad
     * regular expression. A command ending in a letter is not matched when
     * another letter follows, so {@code \it} does not match the beginning of
     * {@code \item}.
     *
     * @param substituted
     *            receives (original, placeholder) pairs, most recent first
     * @param commands
     *            control sequences present in the paragraph
     * @param par
     *            paragraph text
     * @return the paragraph with placeholders inserted
     */
    private String replaceUnknownCommand(LinkedList<String[]> substituted, LinkedList<String> commands,
            String par) {
        int counter = 0;

        for (String command : commands) {
            StringBuilder find = new StringBuilder(Pattern.quote(command));
            char last = command.charAt(command.length() - 1);
            if (findCharCategory(last) == CAT_LETTER) {
                // do not match a prefix of a longer control word
                find.append("(?![a-zA-Z])");
            }

            Matcher m = Pattern.compile(find.toString()).matcher(par);
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                String replace = "<u" + counter + ">";
                String[] subst = { reHarden(m.group(0)), reHarden(replace) };
                substituted.addFirst(subst);
                m.appendReplacement(sb, replace);
                counter++;
            }
            m.appendTail(sb);

            par = sb.toString();
        }
        return par;
    }

    /**
     * Escapes backslash, {@code [}, {@code ^}, {@code $} and {@code {} with
     * a backslash, so the result can be handed to
     * {@link String#replaceAll(String, String)} without those characters
     * being interpreted.
     *
     * @param re
     *            text to escape
     * @return the escaped text
     */
    private String reHarden(String re) {
        re = re.replaceAll("\\\\", "\\\\\\\\"); // replace \ with \\
        re = re.replaceAll("\\[", "\\\\[");
        re = re.replaceAll("\\^", "\\\\^");
        re = re.replaceAll("\\$", "\\\\\\$");
        re = re.replaceAll("\\{", "\\\\{");
        return re;
    }

    /**
     * Translates one paragraph: commands are replaced by placeholders,
     * leading and trailing placeholders are peeled off, the remaining text
     * is passed to {@code processEntry}, and finally the placeholders are
     * replaced by the original commands again.
     *
     * @param commands
     *            control sequences present in the paragraph
     * @param par
     *            paragraph text
     * @return the processed paragraph
     */
    private String processParagraph(LinkedList<String> commands, String par) {
        LinkedList<String[]> substituted = new LinkedList<String[]>();

        par = substituteUnicode(par);

        par = replaceOneArgNoText(substituted, commands, par);
        par = replaceOneArgInlineText(substituted, commands, par);
        par = replaceOneArgParText(substituted, commands, par);
        par = replaceUnknownCommand(substituted, commands, par);

        Matcher m = SURROUNDING_TAGS.matcher(par);
        if (m.find()) {
            StringBuilder rebuilt = new StringBuilder();
            if (m.group(1) != null) {
                rebuilt.append(m.group(1));
            }
            if (m.group(3) != null) {
                rebuilt.append(processEntry(m.group(3)));
            }
            if (m.group(4) != null) {
                rebuilt.append(m.group(4));
            }
            par = rebuilt.toString();
        }

        par = resubstituteTex(par);

        // Entries were added with addFirst, so the most recent substitution
        // is undone first.
        for (String[] subst : substituted) {
            par = par.replaceAll(subst[1], subst[0]);
        }

        return par;
    }

}