/******************************************************************************* * Copyright (c) 2015 EclipseSource GmbH and others. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * * Contributors: * Michael Borkowski - initial API and implementation *******************************************************************************/ package org.eclipse.emf.compare.ide.ui.internal.logical; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.LinkedList; import org.eclipse.emf.compare.internal.dmp.LineBasedDiff; import org.eclipse.emf.compare.internal.dmp.diff_match_patch; import org.eclipse.emf.compare.internal.dmp.diff_match_patch.Diff; /** * This class is responsible for computing similarities between two text contents and deciding whether they * are close enough to be considered a rename. * * @author Michael Borkowski <mborkowski@eclipsesource.com> */ public final class SimilarityComputer { /** * The minimum length both sides must have to not be ignored (short text contents might seem similar and * cause false negatives). */ public static final int MINIMUM_LENGTH = 1024; /** * The maximum percentage of differing lines contained in the content for files to be considered a rename. */ public static final double THRESHOLD = 0.3; /** * Private constructor to prevent instantiation. */ private SimilarityComputer() { } /** * Decides whether two input streams are similar. This methods calls * {@link #computeDifference(InputStream, InputStream)} and compares the value with {@link #THRESHOLD}. * * @param a * the first input stream * @param b * the second input stream * @return <code>true</code> if the input streams are similar * @throws IOException * if reading of one of the input streams fails */ public static boolean isSimilar(InputStream a, InputStream b) throws IOException { return computeDifference(a, b) < THRESHOLD; } /** * Computes the difference between two {@link InputStream} instances. The returned value is a ratio of * changed lines to total lines, where total lines is denoted by the maximum of the line counts of both * input streams. This method returns {@link Double#MAX_VALUE} if one or both of the streams are * <code>null</code> or if the content is too short to be compared (shorter than {@link #MINIMUM_LENGTH}). * * @param a * the first input stream * @param b * the second input stream * @return how different the two streams are * @throws IOException * if reading of one of the input streams fails */ public static double computeDifference(InputStream a, InputStream b) throws IOException { if (a == null || b == null) { return Double.MAX_VALUE; } try { LineFile fileA; LineFile fileB; // even though the file might not be encoded in UTF-8, decoding both in UTF-8 should not harm the // similarity function fileA = readUtf8(a); fileB = readUtf8(b); return internalCalculateSimilarity(fileA, fileB); } finally { try { a.close(); } catch (IOException ignored) { // ignore } try { b.close(); } catch (IOException ignored) { // ignore } } } /** * Internal method for calculating similarity, without checks. * * @param a * the first file * @param b * the second file * @return the similarity */ private static double internalCalculateSimilarity(LineFile a, LineFile b) { if (a.characterCount < MINIMUM_LENGTH || b.characterCount < MINIMUM_LENGTH) { return Double.MAX_VALUE; } final LineBasedDiff lineBasedDiff = new LineBasedDiff(); long differences = 0; final LinkedList<Diff> diffs = lineBasedDiff.diff_main(a.content, b.content, false); for (Diff diff : diffs) { if (diff.operation != diff_match_patch.Operation.EQUAL) { differences++; } } return (double)differences / Math.max(a.lineCount, b.lineCount); } /** * Reads the stream as a UTF-8 encoded stream. * * @param stream * the stream to read from * @return the resulting string * @throws IOException * if reading fails */ private static LineFile readUtf8(InputStream stream) throws IOException { final BufferedReader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); //$NON-NLS-1$ final StringBuilder builder = new StringBuilder(); final LineFile result = new LineFile(); String rd; while ((rd = reader.readLine()) != null) { builder.append(rd); builder.append('\n'); result.lineCount++; result.characterCount += rd.length() + 1; } result.content = builder.toString(); return result; } /** * An auxiliary data structure for internally representing a file. * * @author mborkowski */ private static class LineFile { /** * The character count of the file. */ long characterCount; /** * The line count of the file. */ long lineCount; /** * The contents of the file. */ String content; } }