// Copyright 2014 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package marmot.tokenize.openlp; import java.util.LinkedList; import java.util.List; import marmot.util.LevenshteinLattice; import marmot.util.StringUtils; public class LevenshteinAligner implements Aligner { private long timeout_ = 1000; public LevenshteinAligner() { this(1000); } public LevenshteinAligner(long timeout) { timeout_ = timeout; } private static final String TIMEOUT_STRING = "<TIMEOUT>"; private static final List<Character> TIMEOUT_LIST = new LinkedList<Character>(); static { for (int i=TIMEOUT_STRING.length()- 1; i>=0; i--) { TIMEOUT_LIST.add(TIMEOUT_STRING.charAt(i)); } } class SpecialLevenshteinLattice extends LevenshteinLattice { private long timeout_; public SpecialLevenshteinLattice(String input, String output, long timeout) { super(input, output, 2, 2, 3); timeout_ = timeout; } @Override protected int getReplaceCost(char input, char output) { if (output == ' ' || input == ' ') { return 1000; } return super.getReplaceCost(input, output); } class State { int input_index; int output_index; List<Character> current_path; public State getNewState(char op, int input_diff, int output_diff) { State state = new State(); state.input_index = input_index + input_diff; state.output_index = output_index + output_diff; state.current_path = new LinkedList<Character>(current_path); state.current_path.add(op); return state; } } @Override public String searchOperationSequence() { init(); State state = new State(); state.input_index = input_.length(); state.output_index = output_.length(); state.current_path = new LinkedList<Character>(); List<State> states = new LinkedList<State>(); states.add(state); List<Character> seq = searchOperationSequence(states); if (seq == null) { return null; } StringBuilder sb = new StringBuilder(seq.size()); for (char c : seq) { sb.append(c); } sb.reverse(); return sb.toString(); } private List<Character> searchOperationSequence(List<State> states) { long time = System.currentTimeMillis(); while (!states.isEmpty()) { State state = states.remove(0); long current_time = System.currentTimeMillis(); if (current_time - time > timeout_) { return TIMEOUT_LIST; } short op = op_lattice_[state.input_index][state.output_index]; if ((op & START) > 0) { assert op == START; return state.current_path; } if ((op & COPY) > 0) { states.add(state.getNewState('C', -1, -1)); } if ((op & REPLACE) > 0) { states.add(state.getNewState('R', -1, -1)); } if ((op & INSERT) > 0) { List<Character> current_path = state.current_path; if (output_.charAt(state.output_index - 1) == ' ') { states.add(state.getNewState('I', 0, -1)); } else { if (current_path.size() > 0) { char last_op = current_path .get(current_path.size() - 1); char last_char = output_.charAt(state.output_index); if ((last_op == 'I' && last_char != ' ') || last_op == 'R') { states.add(state.getNewState('I', 0, -1)); } } } } if ((op & DELETE) > 0) { List<Character> current_path = state.current_path; if (input_.charAt(state.input_index - 1) == ' ') { states.add(state.getNewState('D', -1, 0)); } else { if (current_path.size() > 0) { char last_op = current_path .get(current_path.size() - 1); char last_char = input_.charAt(state.input_index); if ((last_op == 'D' && last_char != ' ') || last_op == 'R') { states.add(state.getNewState('D', -1, 0)); } } } } } return null; } } @Override public Result align(String input, String output) { input = StringUtils.clean(input); output = StringUtils.clean(output); SpecialLevenshteinLattice lattice = new SpecialLevenshteinLattice( input, output, timeout_); String operations = lattice.searchOperationSequence(); if (operations == null) { return new Result(ResultType.NoAlignmentFound); } if (operations.equals(TIMEOUT_STRING)) { return new Result(ResultType.Timeout); } List<Pair> pairs = new LinkedList<Pair>(); int input_index = 0; int output_index = 0; for (int i = 0; i < operations.length(); i++) { char op = operations.charAt(i); switch (op) { case 'C': //System.err.format("c(%c,%c)\n", input.charAt(input_index), // output.charAt(output_index)); pairs.add(new Pair(input_index, output_index)); input_index++; output_index++; break; case 'R': //System.err.format("r(%c,%c)\n", input.charAt(input_index), // output.charAt(output_index)); pairs.add(new Pair(input_index, output_index)); input_index++; output_index++; break; case 'D': if (input.charAt(input_index) == ' ') { pairs.add(new Pair(input_index, - 1)); } else { pairs.add(new Pair(input_index, output_index)); } input_index++; break; case 'I': //System.err.format("i(%c)\n", output.charAt(output_index)); if (output.charAt(output_index) == ' ') { pairs.add(new Pair(-1, output_index)); } else { pairs.add(new Pair(input_index, output_index)); } output_index++; break; } } return new Result(ResultType.Standard, pairs); } }