/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2012 Aaron Madlon-Kay
               2013 Zoltan Bartko
               2015 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/

package org.omegat.core.matching;

import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

import org.omegat.core.Core;
import org.omegat.tokenizer.ITokenizer;

import bmsi.util.Diff;

/**
 * Drives a diff engine to produce rendered textual diff output.
 * Uses GNU Diff for Java: http://bmsi.com/java/#diff
 *
 * @author Aaron Madlon-Kay
 */
public final class DiffDriver {

    private DiffDriver() {
    }

    /** Kind of a text run in the rendered diff. */
    public enum Type {
        INSERT, DELETE, NOCHANGE
    }

    /**
     * An unchanged run consisting only of these delimiter characters may be
     * absorbed when two neighbouring delete/insert pairs are merged.
     */
    public static final Pattern DIFF_MERGEABLE_DELIMITER_PATTERN = Pattern.compile("[ :;,.()]+");

    /**
     * Given two strings, perform a diff comparison and return a Render object.
     *
     * @param original Original string
     * @param revised Revised string for comparison
     * @param optimize When <code>true</code>, unchanged tokens are also recorded as
     *            {@link Type#NOCHANGE} runs and the result is passed through the
     *            merge optimization; when <code>false</code>, only
     *            {@link Type#DELETE} and {@link Type#INSERT} runs are recorded.
     * @return Render object. If the project's source tokenizer is <code>null</code>, or
     *         tokenizing either string yields <code>null</code>, an empty Render is
     *         returned whose <code>text</code> is still <code>null</code>.
     */
    public static Render render(String original, String revised, boolean optimize) {
        Render result = new Render();

        ITokenizer tokenizer = Core.getProject().getSourceTokenizer();
        if (tokenizer == null) {
            // Project has probably been closed.
            return result;
        }

        String[] originalStrings = tokenizer.tokenizeVerbatimToStrings(original);
        String[] revisedStrings = tokenizer.tokenizeVerbatimToStrings(revised);

        if (originalStrings == null || revisedStrings == null) {
            return result;
        }

        // Get "change script", a linked list of Diff.changes.
        Diff diff = new Diff(originalStrings, revisedStrings);
        Diff.change script = diff.diff_2(false);
        assert (validate(script, originalStrings, revisedStrings));

        StringBuilder rawText = new StringBuilder();

        // Walk original token strings past the last index in
        // case there was an insertion at the end.
        for (int n = 0; n <= originalStrings.length; n++) {
            Diff.change c = search(n, script);
            if (c == null) {
                // No change for this token.
                if (n < originalStrings.length) {
                    appendUnchanged(result, rawText, originalStrings[n], optimize);
                }
                continue;
            }

            // Next time, start search from the next change.
            script = c.link;

            // Handle deletions
            if (c.deleted > 0) {
                int start = rawText.length();
                for (int m = 0; m < c.deleted; m++) {
                    rawText.append(originalStrings[n + m]);
                }
                // Skip the deleted tokens; the loop increment accounts for the last one.
                n += c.deleted - 1;
                result.addRun(start, rawText.length() - start, Type.DELETE);
            }

            // Handle insertions
            if (c.inserted > 0) {
                int start = rawText.length();
                for (int m = 0; m < c.inserted; m++) {
                    rawText.append(revisedStrings[c.line1 + m]);
                }
                result.addRun(start, rawText.length() - start, Type.INSERT);

                // If this was an insert only (no deleted lines), we should
                // add the original token in as well.
                if (c.deleted == 0 && n < originalStrings.length) {
                    appendUnchanged(result, rawText, originalStrings[n], optimize);
                }
            }
        }

        result.text = rawText.toString();

        if (!optimize) {
            return result;
        }
        Render optimized = optimizeRender(result, 0);
        return (optimized.formatting.size() < result.formatting.size()) ? optimized : result;
    }

    /**
     * Append an unchanged original token to the raw text. A {@link Type#NOCHANGE}
     * run is recorded for it only when optimizing.
     *
     * @param result Render receiving the run
     * @param rawText Text accumulated so far; the token is appended to it
     * @param token Unchanged token from the original string
     * @param optimize Whether to record the NOCHANGE run
     */
    private static void appendUnchanged(Render result, StringBuilder rawText, String token,
            boolean optimize) {
        if (optimize) {
            // The run must be registered before appending so that it starts at the
            // current end of the text.
            result.addRun(rawText.length(), token.length(), Type.NOCHANGE);
        }
        rawText.append(token);
    }

    /**
     * Merge each "deletion, insertion, delimiter, deletion, insertion" sequence of
     * runs into a single deletion followed by a single insertion, where the
     * delimiter is an unchanged run fully matching
     * {@link #DIFF_MERGEABLE_DELIMITER_PATTERN}. The delimiter text is included in
     * both merged runs. The method calls itself on its own output until
     * <code>level</code> exceeds 3, and returns whichever of the current and the
     * deeper result has fewer runs.
     *
     * @param render Render to optimize; not modified
     * @param level Current recursion depth, starting at 0
     * @return The given render if <code>level</code> is greater than 3 or it has
     *         fewer than five runs; otherwise a newly built render
     */
    private static Render optimizeRender(Render render, int level) {
        if (level > 3) {
            return render;
        }

        List<TextRun> fList = render.formatting;

        // A mergeable pattern needs at least five consecutive runs.
        if (fList.size() < 5) {
            return render;
        }

        // The run list is a LinkedList, so indexed get() is linear. Snapshot it
        // once to keep the look-ahead below constant-time per element.
        TextRun[] runs = fList.toArray(new TextRun[fList.size()]);

        StringBuilder rawText = new StringBuilder();
        Render result = new Render();

        for (int i = 0; i < runs.length; i++) {
            TextRun r0 = runs[i];
            if (i < runs.length - 4) {
                TextRun r1 = runs[i + 1];
                TextRun r2 = runs[i + 2];
                TextRun r3 = runs[i + 3];
                TextRun r4 = runs[i + 4];

                if (r0.type == Type.DELETE
                        && r1.type == Type.INSERT
                        && r2.type == Type.NOCHANGE
                        && DIFF_MERGEABLE_DELIMITER_PATTERN.matcher(render.getRunText(r2)).matches()
                        && r3.type == Type.DELETE
                        && r4.type == Type.INSERT) {
                    String delimiter = render.getRunText(r2);

                    // merge deletes
                    String deleted = render.getRunText(r0) + delimiter + render.getRunText(r3);
                    result.addRun(rawText.length(), deleted.length(), Type.DELETE);
                    rawText.append(deleted);

                    // merge inserts
                    String inserted = render.getRunText(r1) + delimiter + render.getRunText(r4);
                    result.addRun(rawText.length(), inserted.length(), Type.INSERT);
                    rawText.append(inserted);

                    // Skip the four runs consumed by the merge.
                    i += 4;
                    continue;
                }
            }

            // Not the start of a mergeable pattern: copy the run as-is.
            result.addRun(rawText.length(), r0.length, r0.type);
            rawText.append(render.getRunText(r0));
        }

        result.text = rawText.toString();

        Render optimized = optimizeRender(result, level + 1);
        return (optimized.formatting.size() < result.formatting.size()) ? optimized : result;
    }

    /**
     * Walk a change script until we find a change at the given index.
     *
     * @param i Index to seek
     * @param script Change script
     * @return Element at index i, or null if not found
     */
    private static Diff.change search(int i, Diff.change script) {
        // Give up when we reach the end of the list,
        // OR if we've passed the desired index (list is sorted in increasing order).
        for (Diff.change c = script; c != null && c.line0 <= i; c = c.link) {
            if (c.line0 == i) {
                return c;
            }
        }
        return null;
    }

    /**
     * Double check some assumptions made about change scripts. Only meant to be called in debug, via assert.
     *
     * @param script
     *            Linked list of Diff.change elements
     * @param original
     *            Original strings
     * @param revised
     *            Revised strings
     * @return Whether or not the change script is valid
     */
    private static boolean validate(Diff.change script, String[] original, String[] revised) {
        Diff.change prev = null;
        for (Diff.change c = script; c != null; c = c.link) {
            // Script is sorted in increasing order of string line number.
            if (prev != null && (c.line0 <= prev.line0 || c.line1 <= prev.line1)) {
                return false;
            }
            // All changes will be accounted for by walking c.line0 in range [0, original.length].
            if (c.line0 < 0 || c.line0 > original.length) {
                return false;
            }
            prev = c;
        }
        return true;
    }

    /**
     * Represents the output of a string-string diff comparison. Contains the raw text for display, as well as
     * formatting information.
     *
     * @author aaron.madlon-kay
     */
    public static class Render {
        /** Formatting runs, in the order they were added. */
        public List<TextRun> formatting = new LinkedList<TextRun>();

        /** Raw text for display; <code>null</code> until assigned. */
        public String text;

        /**
         * Append a new formatting run.
         *
         * @param start Offset of the run within the text
         * @param length Length of the run
         * @param type Type of the run
         */
        public void addRun(int start, int length, Type type) {
            formatting.add(new TextRun(start, length, type));
        }

        /**
         * Get the text corresponding to the run.
         *
         * @param run Run whose start and length select a range of {@link #text}
         * @return The substring of {@link #text} covered by the run
         */
        public String getRunText(TextRun run) {
            return text.substring(run.start, run.start + run.length);
        }
    }

    /**
     * Indicates formatting of a text run for diff display purposes.
     *
     * @author aaron.madlon-kay
     */
    public static class TextRun {
        public int start;
        public int length;
        public Type type;

        /**
         * @param start Offset of the run; asserted to be non-negative
         * @param length Length of the run; asserted to be at least 1
         * @param type Type of the run; asserted to be non-null
         */
        public TextRun(final int start, final int length, final Type type) {
            assert (start >= 0);
            assert (length >= 1);
            assert (type != null);
            this.start = start;
            this.length = length;
            this.type = type;
        }

        @Override
        public String toString() {
            return String.format("%s: %d +%d", type, start, length);
        }
    }
}