package org.juxtasoftware.diff;
import java.io.IOException;
import java.net.URI;
import java.util.List;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Ordering;
import difflib.Delta;
import difflib.DiffUtils;
import difflib.Patch;
import eu.interedition.text.Annotation;
import eu.interedition.text.Name;
import eu.interedition.text.Range;
import eu.interedition.text.Text;
import eu.interedition.text.mem.SimpleAnnotation;
import eu.interedition.text.mem.SimpleName;
import eu.interedition.text.util.Annotations;
/**
*
* Perform a diff upon two comperands and tansform results into alignments. Diff is done
* using google diff-utils and the meyers diff algorithm.
*
*/
public class DiffCollator {
protected static final Name GAP_NAME = new SimpleName(URI.create("http://juxtasoftware.org/ns"), "gap");
protected static final Logger LOG = LoggerFactory.getLogger(DiffCollator.class);
protected boolean transpositionCollation;
public void collate(DiffCollatorConfiguration config, Comparand base, Comparand witness) throws IOException {
// This amounts to a config for the diff/collation
Comparison comparison = new Comparison(base, witness);
// First find an filter out any transpositions and collate
// the filtered result
final Set<Set<Annotation>> transpositions = config.getTranspositionSource().transpositionsIn(comparison);
for (Comparison filtered : comparison.filter(transpositions)) {
LOG.info("Collating " + filtered);
this.transpositionCollation = false;
collate(config, filtered);
}
// Now take each of the transpositions and perform a
// mini-collation on it
for (Set<Annotation> transposition : transpositions) {
LOG.info("Collating transposition " + Iterables.toString(transposition));
this.transpositionCollation = true;
collate(config, new Comparison(base, witness, transposition));
}
Runtime.getRuntime().gc();
Runtime.getRuntime().runFinalization();
}
protected void collate(DiffCollatorConfiguration config, Comparison collation) throws IOException {
// Pull ALL tokens for base and witness into memory.
// The token inclides annotation info (range, qname, etc) plus
// notmalized text based on settings (no case, punctiuation stripped, etc)
final TokenSource tokenSource = config.getTokenSource();
final Text base = collation.getBase().getText();
final Text witness = collation.getWitness().getText();
final List<Token> baseTokens = tokenSource.tokensOf(base, collation.getBaseRanges());
final List<Token> witnessTokens = tokenSource.tokensOf(witness, collation.getWitnessRanges());
if (baseTokens.isEmpty() && witnessTokens.isEmpty()) {
return;
}
// Do the diff!
Patch diffResult = DiffUtils.diff(baseTokens, witnessTokens);
// Convert the dif results into differences
DifferenceStore differenceStore = config.getDifferenceStore();
int baseTokenIndex = 0;
int witnessTokenIndex = 0;
int diffSequece = 0;
// Each diff may contain several contiguous tokens that make
// up the total diff. Assign each diff a sequence number so
// the token-based diffs can be grouped back into contiguous
// runs of text later
for ( Delta diff : diffResult.getDeltas() ) {
diffSequece++;
// grab references to diff token indexes for base (original) and witness (revised)
int baseDiffTokenStartIndex = diff.getOriginal().getPosition();
int baseDiffTokenEndIndex = baseDiffTokenStartIndex + diff.getOriginal().getLines().size();
int witnessDiffTokenStartIndex = diff.getRevised().getPosition();
int witnessDiffTokenEndIndex = witnessDiffTokenStartIndex + diff.getRevised().getLines().size();
do {
// curr indexes are before change - these are aligned, just skip over them
if ( baseTokenIndex < baseDiffTokenStartIndex && witnessTokenIndex < witnessDiffTokenStartIndex) {
baseTokenIndex++;
witnessTokenIndex++;
continue;
}
// curr indexes are both within change bounds... mark as change - no gaps
if ( baseTokenIndex < baseDiffTokenEndIndex && witnessTokenIndex < witnessDiffTokenEndIndex) {
Token baseToken = baseTokens.get(baseTokenIndex++);
Token witnessToken = witnessTokens.get(witnessTokenIndex++);
differenceStore.add( createDifference(diffSequece, Difference.Type.CHANGE,
baseToken.getAnnotation(), baseToken,
witnessToken.getAnnotation(), witnessToken));
continue;
}
// Base still within change range, witnesss not. Introduce a gap in the witness
if ( baseTokenIndex < baseDiffTokenEndIndex && witnessTokenIndex >= witnessDiffTokenEndIndex) {
Token baseToken = baseTokens.get(baseTokenIndex++);
Annotation witnessGap = createGap( witness, witnessTokenIndex, witnessTokens );
differenceStore.add( createDifference(diffSequece, Difference.Type.ADD_DEL,
baseToken.getAnnotation(), baseToken, witnessGap, null));
continue;
}
// WITNESS still within change range, base not. Introduce a gap in the base
if ( baseTokenIndex >= baseDiffTokenEndIndex && witnessTokenIndex < witnessDiffTokenEndIndex) {
Token witnessToken = witnessTokens.get(witnessTokenIndex++);
Annotation baseGap = createGap( base, baseTokenIndex, baseTokens );
differenceStore.add( createDifference(diffSequece, Difference.Type.ADD_DEL,
baseGap, null, witnessToken.getAnnotation(), witnessToken));
continue;
}
} while ( baseTokenIndex < baseDiffTokenEndIndex || witnessTokenIndex < witnessDiffTokenEndIndex );
}
}
private Annotation createGap(Text comparandText, int currIndex, List<Token> tokens) throws IOException {
if ( currIndex == 0 ) {
if ( this.transpositionCollation == false ) {
return new GapAnnotation(comparandText, 0);
}
return gap( tokens.get(0).getAnnotation() );
} else {
return gap( tokens.get(currIndex-1).getAnnotation() );
}
}
private Difference createDifference(int diffSequece, Difference.Type type, Annotation base, Token baseToken, Annotation witness, Token witnessToken) {
int editDistance = 0;
if (type.equals(Difference.Type.CHANGE) ) {
editDistance = baseToken.editDistanceTo(witnessToken);
} else if (base.getRange().length() == 0 || witness.getRange().length() == 0) {
editDistance = -1;
}
return new Difference(diffSequece, type, base, witness, editDistance);
}
protected Annotation gap(Annotation prev) throws IOException {
if (prev instanceof GapAnnotation) {
return prev;
} else {
return new GapAnnotation(prev.getText(), prev.getRange().getEnd());
}
}
public static class GapAnnotation extends SimpleAnnotation {
private final long offset;
public GapAnnotation(Text text, long offset) {
super(text, GAP_NAME, new Range(offset, offset), null);
this.offset = offset;
}
public long getOffset() {
return offset;
}
@Override
public Text getText() {
return text;
}
@Override
public Name getName() {
return GAP_NAME;
}
@Override
public Range getRange() {
return new Range(offset, offset);
}
@Override
public String toString() {
return "GAP "+text.toString()+" "+getRange().toString();
}
@Override
public int compareTo(Annotation o) {
return Annotations.compare(this, o).compare(this, o, Ordering.arbitrary()).result();
}
public static final Function<Annotation,Long> TO_OFFSET = new Function<Annotation, Long>() {
@Override
public Long apply(Annotation input) {
Preconditions.checkArgument(input.getRange().length() == 0);
return input.getRange().getEnd();
}
};
}
}