package edu.berkeley.cs.nlp.ocular.eval; import java.util.ArrayList; import java.util.List; /** * Implementation of edit distance that supports two nontrivial operations: * * a) Switching costs for going from EQUAL to something else or something else to EQUAL. * We do not charge for beginning or ending the edit sequence with insertions. * * b) Position-specific edit costs for the equality and substitution operations * * We need to expand the DP state to do this. Our DP state is actually quite general * and could store arbitrary other information; this means that it's somewhat slower * than it needs to be because it's object-heavy, but this isn't the bottleneck of our * algorithm and this generality allows possible other extensions to edit distance. * * @author Greg Durrett (gdurrett@cs.berkeley.edu) */ public class MarkovEditDistanceComputer { /** * Stores parameters for the edit distance operation. * Also stores the things being aligned because some parameters are anchored * to these, such as the fancy equality and substitute costs. */ public static class EditDistanceParams { public final Form src; public final Form trg; public final double[] equalCosts; public final double[] substCosts; public final double insertCost; public final double deleteCost; public final boolean allowFSConfusion; public EditDistanceParams(Form src, Form trg, double[] equalCosts, double[] substCosts, double insertCost, double deleteCost, boolean allowFSConfusion) { this.src = src; this.trg = trg; this.equalCosts = equalCosts; this.substCosts = substCosts; this.insertCost = insertCost; this.deleteCost = deleteCost; this.allowFSConfusion = allowFSConfusion; } public static EditDistanceParams getStandardParams(Form src, Form trg, boolean allowFSConfusion) { return new EditDistanceParams(src, trg, populateArr(0, src.length()), populateArr(1, src.length()), 1, 1, allowFSConfusion); } public static double[] populateArr(double val, int len) { double[] arr = new double[len]; for (int i = 0; i < len; i++) { arr[i] = val; } return arr; } } /** * State for the Viterbi forward pass through the edit distance lattice to compute backward costs. */ public static class ForwardSearchState { public final int srcIndex; public final int trgIndex; public final double viterbiBackwardCost; public final ForwardSearchState viterbiBackptr; public ForwardSearchState(int srcIndex, int trgIndex, double viterbiBackwardCost, ForwardSearchState viterbiBackptr) { this.srcIndex = srcIndex; this.trgIndex = trgIndex; this.viterbiBackwardCost = viterbiBackwardCost; this.viterbiBackptr = viterbiBackptr; } } private final EditDistanceParams params; // Indices are src index, trg index, and previous operations. private ForwardSearchState[][] chart; public MarkovEditDistanceComputer(EditDistanceParams params) { this.params = params; this.chart = new ForwardSearchState[params.src.length() + 1][params.trg.length() + 1]; } /** * @param op * @param state * @return The cost to apply the given operator to the given state. */ private double costToApply(Operation op, ForwardSearchState state) { if (!isLegalToApply(op, state)) { throw new RuntimeException("Illegal operation; applying " + op + " to " + state.srcIndex + ", " + state.trgIndex + " of " + params.src + "-" + params.trg); } double cost = 0; if (op == Operation.INSERT) { cost += params.insertCost; } else if (op == Operation.DELETE) { cost += params.deleteCost; } else if (op == Operation.SUBST) { cost += params.substCosts[state.srcIndex]; } else if (op == Operation.EQUAL) { cost += params.equalCosts[state.srcIndex]; } return cost; } /** * @param op * @param state * @return True if it is legal to apply the given operation to the given state. * Checks bounds and conditions for equal vs. substitute */ private boolean isLegalToApply(Operation op, ForwardSearchState state) { boolean roomOnSrc = state.srcIndex < params.src.length(); boolean roomOnTrg = state.trgIndex < params.trg.length(); if (op == Operation.INSERT) { return roomOnTrg; } else if (op == Operation.DELETE) { return roomOnSrc; } else { // EQUAL or SUBST must have room on both sides if (!roomOnSrc || !roomOnTrg) { return false; } // Now check that EQUAL applies only to equal characters and SUBST only to // unequal characters Glyph srcGlyph = params.src.charAt(state.srcIndex); Glyph trgGlyph = params.trg.charAt(state.trgIndex); boolean charsEq = srcGlyph.equals(trgGlyph); // Allow permissible confusions with zero cost if (params.allowFSConfusion && !charsEq) { // Some optimization... int srcGlyphLength = srcGlyph.glyph.length(); int trgGlyphLength = trgGlyph.glyph.length(); if (srcGlyphLength == trgGlyphLength) { if (srcGlyphLength == 1) { charsEq = srcGlyph.glyph.equals("f") && trgGlyph.glyph.equals("s"); } else { Glyph newSrc = new Glyph(srcGlyph.glyph.replaceAll("f", "*").replaceAll("s", "*")); Glyph newTrg = new Glyph(trgGlyph.glyph.replaceAll("s", "*")); charsEq = newSrc.equals(newTrg); } } } return (op == Operation.EQUAL && charsEq) || (op == Operation.SUBST && !charsEq); } } /** * @param op * @param state * @return A new state produced by applying op to the given state, or null * if op cannot be legally applied here */ private ForwardSearchState apply(Operation op, ForwardSearchState state) { if (!isLegalToApply(op, state)) { return null; } int newSrcIndex = state.srcIndex; int newTrgIndex = state.trgIndex; if (op == Operation.EQUAL || op == Operation.SUBST) { newSrcIndex++; newTrgIndex++; } else if (op == Operation.INSERT) { newTrgIndex++; } else if (op == Operation.DELETE) { newSrcIndex++; } double costDelta = costToApply(op, state); return new ForwardSearchState(newSrcIndex, newTrgIndex, state.viterbiBackwardCost + costDelta, state); } /** * Does the forward pass, computing Viterbi backwards scores for each state. */ private void forwardPass() { chart[0][0] = new ForwardSearchState(0, 0, 0, null); // Loop over chart cells for (int srcIndex = 0; srcIndex < params.src.length() + 1; srcIndex++) { if (params.src.length() > 10000 && srcIndex != 0 && srcIndex % 500 == 0) { System.out.println("Edit distance working...on srcIndex " + srcIndex + " / " + params.src.length()); } for (int trgIndex = 0; trgIndex < params.trg.length() + 1; trgIndex++) { ForwardSearchState prevState = chart[srcIndex][trgIndex]; if (prevState == null) { continue; } // Loop over operations that could be applied to the given cell for (int opIndex = 0; opIndex < Operation.values().length; opIndex++) { Operation currOp = Operation.values()[opIndex]; // Produce the result of applying the operation and insert it into the chart as appropriate ForwardSearchState result = apply(currOp, prevState); if (result != null) { ForwardSearchState currEntry = chart[result.srcIndex][result.trgIndex]; if (currEntry == null || result.viterbiBackwardCost < currEntry.viterbiBackwardCost) { chart[result.srcIndex][result.trgIndex] = result; } } } } } } /** * Moves back through the chart and extracts the one-best solution. * @return The forms being aligned here and their one-best alignment. */ private AlignedFormPair backwardPass() { ForwardSearchState currState = chart[params.src.length()][params.trg.length()]; if (currState == null) { throw new RuntimeException("Edit distance returned nothing for " + params.src + "-" + params.trg); } double cost = currState.viterbiBackwardCost; List<Operation> ops = new ArrayList<Operation>(); // Until we hit the first state, accrue the edit ops (which come in reverse order) while (currState.viterbiBackptr != null) { // Figure out which operation was used int thisSrcIdx = currState.srcIndex; int thisTrgIdx = currState.trgIndex; int prevSrcIdx = currState.viterbiBackptr.srcIndex; int prevTrgIdx = currState.viterbiBackptr.trgIndex; Operation op; if (prevSrcIdx == thisSrcIdx) { op = Operation.INSERT; } else if (prevTrgIdx == thisTrgIdx) { op = Operation.DELETE; } else { if (params.src.charAt(prevSrcIdx).equals(params.trg.charAt(prevTrgIdx))) { op = Operation.EQUAL; } else { op = Operation.SUBST; } } ops.add(0, op); currState = currState.viterbiBackptr; } return new AlignedFormPair(params.src, params.trg, ops, cost); } public AlignedFormPair runEditDistance() { if (params.src.length() > 10000) { System.out.println("Running edit distance with source length " + params.src.length() + ", for src length 7000 takes 30 seconds and 1+GB of memory"); } forwardPass(); return backwardPass(); } }