package com.bericotech.clavin.util;
import static com.bericotech.clavin.util.DamerauLevenshtein.damerauLevenshteinDistance;
import static com.bericotech.clavin.util.DamerauLevenshtein.damerauLevenshteinDistanceCaseInsensitive;
import static com.bericotech.clavin.util.DamerauLevenshtein.isEditDistance1;
import static org.junit.Assert.*;
import java.math.BigInteger;
import java.util.Random;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.bericotech.clavin.GeoParser;
/*#####################################################################
*
* CLAVIN (Cartographic Location And Vicinity INdexer)
* ---------------------------------------------------
*
* Copyright (C) 2012-2013 Berico Technologies
* http://clavin.bericotechnologies.com
*
* ====================================================================
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* ====================================================================
*
* DamerauLevenshteinTest.java
*
*###################################################################*/
/**
 * Some tests designed to put the Damerau-Levenshtein utilities through
 * their paces.
 */
public class DamerauLevenshteinTest {

    private static final Logger logger = LoggerFactory.getLogger(DamerauLevenshteinTest.class);

    /** Number of random inputs generated by each randomized check. */
    private static final int RANDOM_ITERATIONS = 100000;

    /** Number of random bits used to build each random test string. */
    private static final int RANDOM_STRING_BITS = 130;

    /**
     * Builds a pseudo-random string by rendering a random
     * non-negative integer in base 32 (digits 0-9 and a-v).
     *
     * @param random    source of randomness
     * @return          a pseudo-random base-32 string
     */
    private static String randomString(Random random) {
        return new BigInteger(RANDOM_STRING_BITS, random).toString(32);
    }

    /**
     * Simple tests to make sure we're getting the correct edit distance
     * for all the different edit operations, and a "smoke test" to
     * ensure we're not crashing on unexpected input.
     */
    @Test
    public void testDamerauLevenshteinDistance() {
        String a = null; String b = null;
        assertEquals("both null", 0, damerauLevenshteinDistance(a, b));

        a = null; b = "y";
        assertEquals("first null", 1, damerauLevenshteinDistance(a, b));

        a = "x"; b = null;
        assertEquals("second null", 1, damerauLevenshteinDistance(a, b));

        a = ""; b = "";
        assertEquals("both empty", 0, damerauLevenshteinDistance(a, b));

        a = ""; b = "y";
        assertEquals("first empty", 1, damerauLevenshteinDistance(a, b));

        a = "x"; b = "";
        assertEquals("second empty", 1, damerauLevenshteinDistance(a, b));

        a = "x"; b = "x";
        assertEquals("same", 0, damerauLevenshteinDistance(a, b));

        a = "x"; b = "y";
        assertEquals("substitution", 1, damerauLevenshteinDistance(a, b));

        a = "xy"; b = "x";
        assertEquals("deletion", 1, damerauLevenshteinDistance(a, b));

        a = "x"; b = "xy";
        assertEquals("insertion", 1, damerauLevenshteinDistance(a, b));

        a = "xy"; b = "yx";
        assertEquals("transposition", 1, damerauLevenshteinDistance(a, b));

        a = "xyz"; b = "yzx";
        assertEquals("adjacent transpositions", 2, damerauLevenshteinDistance(a, b));

        a = "xyz"; b = "zx";
        assertEquals("editing a substring more than once", 2, damerauLevenshteinDistance(a, b));

        // generate 100k pairs of random strings & make sure we can handle them
        Random random = new Random();
        for (int i = 0; i < RANDOM_ITERATIONS; i++) {
            a = randomString(random);
            b = randomString(random);
            assertTrue("smoke test: " + a + " vs " + b, damerauLevenshteinDistance(a, b) > -1);
        }
    }

    /**
     * Ensure the case-insensitive version of damerauLevenshteinDistance()
     * reports a distance of zero between every pair of strings that are
     * identical except for case.
     */
    @Test
    public void testDamerauLevenshteinDistanceCaseInsensitive() {
        String a = "Berico Technologies";
        String b = "BERICO TECHNOLOGIES";
        String c = "bErIcO tEcHnOlOgIeS";

        assertEquals("unwanted case sensitivity", 0, damerauLevenshteinDistanceCaseInsensitive(a, b));
        assertEquals("unwanted case sensitivity", 0, damerauLevenshteinDistanceCaseInsensitive(a, c));
        assertEquals("unwanted case sensitivity", 0, damerauLevenshteinDistanceCaseInsensitive(b, c));
    }

    /**
     * Some simple tests to ensure we're getting correct output for
     * various combinations of edit operations, plus a "smoke test" to
     * ensure we're keeping consistent with damerauLevenshteinDistance().
     */
    @Test
    public void testIsEditDistance1() {
        // null and empty inputs
        String a = null; String b = null;
        assertTrue("both null", isEditDistance1(a, b));

        a = null; b = "y";
        assertTrue("first null", isEditDistance1(a, b));

        a = "x"; b = null;
        assertTrue("second null", isEditDistance1(a, b));

        a = ""; b = "";
        assertTrue("both empty", isEditDistance1(a, b));

        a = ""; b = "y";
        assertTrue("first empty", isEditDistance1(a, b));

        a = "x"; b = "";
        assertTrue("second empty", isEditDistance1(a, b));

        a = null; b = "xyz";
        assertFalse("first null, second long", isEditDistance1(a, b));

        a = "xyz"; b = null;
        assertFalse("second null, first long", isEditDistance1(a, b));

        // identical strings
        a = "x"; b = "x";
        assertTrue("short same", isEditDistance1(a, b));

        a = "Berico Technologies"; b = "Berico Technologies";
        assertTrue("long same", isEditDistance1(a, b));

        // single substitution
        a = "x"; b = "y";
        assertTrue("simple substitution", isEditDistance1(a, b));

        a = "xzxcvbnm"; b = "yzxcvbnm";
        assertTrue("beginning substitution", isEditDistance1(a, b));

        a = "zxcvbnmxzxcvbnm"; b = "zxcvbnmyzxcvbnm";
        assertTrue("middle substitution", isEditDistance1(a, b));

        a = "zxcvbnmx"; b = "zxcvbnmy";
        assertTrue("ending substitution", isEditDistance1(a, b));

        // single addition
        a = "x"; b = "xy";
        assertTrue("simple addition", isEditDistance1(a, b));

        a = "xzxcvbnm"; b = "xyzxcvbnm";
        assertTrue("beginning addition", isEditDistance1(a, b));

        a = "zxcvbnmxzxcvbnm"; b = "zxcvbnmxyzxcvbnm";
        assertTrue("middle addition", isEditDistance1(a, b));

        a = "zxcvbnmx"; b = "zxcvbnmxy";
        assertTrue("ending addition", isEditDistance1(a, b));

        // single deletion
        a = "xy"; b = "x";
        assertTrue("simple deletion", isEditDistance1(a, b));

        a = "xyzxcvbnm"; b = "xzxcvbnm";
        assertTrue("beginning deletion", isEditDistance1(a, b));

        a = "zxcvbnmxyzxcvbnm"; b = "zxcvbnmxzxcvbnm";
        assertTrue("middle deletion", isEditDistance1(a, b));

        a = "zxcvbnmxy"; b = "zxcvbnmx";
        assertTrue("ending deletion", isEditDistance1(a, b));

        // single transposition
        a = "xy"; b = "yx";
        assertTrue("simple transposition", isEditDistance1(a, b));

        a = "xyzxcvbnm"; b = "yxzxcvbnm";
        assertTrue("beginning transposition", isEditDistance1(a, b));

        a = "zxcvbnmxyzxcvbnm"; b = "zxcvbnmyxzxcvbnm";
        assertTrue("middle transposition", isEditDistance1(a, b));

        a = "zxcvbnmxy"; b = "zxcvbnmyx";
        assertTrue("ending transposition", isEditDistance1(a, b));

        // two adjacent transpositions
        a = "xyz"; b = "yzx";
        assertFalse("simple adjacent transpositions", isEditDistance1(a, b));

        a = "xyzzxcvbnm"; b = "yzxzxcvbnm";
        assertFalse("beginning adjacent transpositions", isEditDistance1(a, b));

        a = "zxcvbnmxyzzxcvbnm"; b = "zxcvbnmyzxzxcvbnm";
        assertFalse("middle adjacent transpositions", isEditDistance1(a, b));

        a = "zxcvbnmxyz"; b = "zxcvbnmyzx";
        assertFalse("ending adjacent transpositions", isEditDistance1(a, b));

        // the same substring edited more than once
        a = "xyz"; b = "zx";
        assertFalse("simple editing a substring more than once", isEditDistance1(a, b));

        a = "xyzzxcvbnm"; b = "zxzxcvbnm";
        assertFalse("beginning editing a substring more than once", isEditDistance1(a, b));

        a = "zxcvbnmxyzzxcvbnm"; b = "zxcvbnmzxzxcvbnm";
        assertFalse("middle editing a substring more than once", isEditDistance1(a, b));

        a = "zxcvbnmxyz"; b = "zxcvbnmzx";
        assertFalse("ending editing a substring more than once", isEditDistance1(a, b));

        // edit distance of exactly 2
        a = "xy"; b = "ab";
        assertFalse("simple edit distance is 2", isEditDistance1(a, b));

        a = "xyzxcvbnm"; b = "abzxcvbnm";
        assertFalse("beginning edit distance is 2", isEditDistance1(a, b));

        a = "zxcvbnmxyzxcvbnm"; b = "zxcvbnmabzxcvbnm";
        assertFalse("middle edit distance is 2", isEditDistance1(a, b));

        a = "zxcvbnmxy"; b = "zxcvbnmab";
        assertFalse("ending edit distance is 2", isEditDistance1(a, b));

        a = "xzxcvbnmy"; b = "azxcvbnmb";
        assertFalse("discontinuous edit distance is 2", isEditDistance1(a, b));

        // mixed pairs of operations
        a = "aaaaabbbbbx"; b = "aaaabbbbby";
        assertFalse("subtraction then substitution", isEditDistance1(a, b));

        a = "aaaaabbbbb"; b = "aaaababbbby";
        assertFalse("transposition then addition", isEditDistance1(a, b));

        // generate 100k random strings, randomly mutate them, & ensure
        // we're keeping consistent with damerauLevenshteinDistance()
        Random random = new Random();
        StringMutator mutator = new StringMutator();
        for (int i = 0; i < RANDOM_ITERATIONS; i++) {
            a = randomString(random);
            b = mutator.mutateString(a, 10);
            // the inputs are random, so report them to make any failure reproducible
            String message = "consistent with true DL: " + a + " vs " + b;
            if (damerauLevenshteinDistance(a, b) < 2) {
                assertTrue(message, isEditDistance1(a, b));
            } else {
                assertFalse(message, isEditDistance1(a, b));
            }
        }
    }

    /**
     * Maximize test coverage by checking toString() method of the
     * Null class.
     */
    @Test
    public void testNullToString() {
        Null myNull = new Null();
        // assertEquals reports the actual value on failure, unlike assertTrue(equals)
        assertEquals("Null class toString() not \"Null\"", "Null", myNull.toString());
    }

    /**
     * Facilitates DNA-like mutation of strings; used only for testing
     * implementation of Damerau-Levenshtein algorithm.
     *
     * Declared static because it never touches the enclosing test
     * instance.
     */
    private static final class StringMutator {

        /** Source of randomness for choosing positions, operations and characters. */
        private final Random r;

        public StringMutator() {
            r = new Random();
        }

        /**
         * (Pseudo-)randomly mutates strings.
         *
         * Goes through a given string one index position at a time, and
         * modifies the chars at that position by randomly performing
         * either a transposition, substitution, insertion, or deletion
         * operation. The odds that any given char will be modified is
         * 1 / the mutationFactor parameter. For example, if mutationFactor
         * = 1, every character will potentially be modified, while if
         * mutationFactor = 10, 1 in 10 characters will be modified.
         *
         * The last position is handled separately because it has no
         * right-hand neighbour to transpose with.
         *
         * @param a                 String to be mutated
         * @param mutationFactor    Likelihood that any char will be changed; must be positive
         * @return                  a "mutated" String
         * @throws IllegalArgumentException if mutationFactor is not positive
         */
        public String mutateString(String a, int mutationFactor) {
            String b = a; // start with a perfect copy

            // all but last char; starts at 0 so the first char can be mutated too
            for (int j = 0; j < b.length() - 1; j++) {
                if (r.nextInt(mutationFactor) == 0) {
                    switch (r.nextInt(4)) {
                        case 0: // transposition
                            b = b.substring(0, j) + b.charAt(j + 1) + b.charAt(j) + b.substring(j + 2);
                            break;
                        case 1: // substitution
                            b = b.substring(0, j) + randChar() + b.substring(j + 1);
                            break;
                        case 2: // insertion
                            b = b.substring(0, j) + randChar() + b.substring(j);
                            break;
                        case 3: // deletion
                            b = b.substring(0, j) + b.substring(j + 1);
                            break;
                        default:
                            break;
                    }
                }
            }

            if (r.nextInt(mutationFactor) == 0) { // last char
                switch (r.nextInt(4)) {
                    case 0: // can't do transposition here
                        break;
                    case 1: // substitution (nothing to replace if the string is empty)
                        if (!b.isEmpty()) {
                            b = b.substring(0, b.length() - 1) + "_";
                        }
                        break;
                    case 2: // insertion
                        b = b + "_";
                        break;
                    case 3: // deletion (nothing to remove if the string is empty)
                        if (!b.isEmpty()) {
                            b = b.substring(0, b.length() - 1);
                        }
                        break;
                    default:
                        break;
                }
            }

            return b;
        }

        /**
         * Generates pseudo-random ASCII characters.
         *
         * Picks a base-36 digit directly rather than taking the first
         * character of Math.abs(nextLong()) rendered in base 36:
         * Math.abs(Long.MIN_VALUE) is still negative, so that approach
         * could return '-'.
         *
         * @return a pseudo-random ASCII character in 0-9 or a-z
         */
        private char randChar() {
            return Character.forDigit(r.nextInt(Character.MAX_RADIX), Character.MAX_RADIX);
        }
    }
}