package org.activityinfo.core.shared.importing.match.names;
import static java.lang.Math.min;
/**
* Compares two words written in latin script.
*/
public class LatinWordDistance {
static final double EXTRA_VOWEL_COST = 0.30;
static final double TRAILING_VOWELS_COST = 0.5;
static final double VOWEL_SUBSTITUTION_COST = 0.20;
static final double DOUBLED_CONSONANT_COST = 0.5;
static final double CLOSING_CONSONANT_COST = 0.75;
static final char NOTHING = 32;
public static class Word {
private char[] chars;
private int start;
/**
* The index of the end of the underlying character array (exclusive)
*/
private int end;
public void set(String s) {
this.chars = s.toCharArray();
this.start = 0;
this.end = s.length();
}
public void set(char[] charArray, int start, int end) {
this.chars = charArray;
this.start = start;
this.end = end;
}
public boolean isEndOfWord(int i) {
return i >= end;
}
public char at(int i) {
if(i < end) {
return chars[i];
} else {
return NOTHING;
}
}
public boolean isRepeated(int i) {
return i > start && chars[i] == chars[i-1];
}
public boolean isVowel(int i) {
return i >= start && i < end && isVowelChar(chars[i]);
}
public boolean isLast(int i) {
return i+1 == end;
}
public int length() {
return end - start;
}
}
private final ConsonantSimilarity consonants = ConsonantSimilarity.get();
private final Word x = new Word();
private final Word y = new Word();
public static boolean isVowelChar(char c) {
return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
}
public double similarity(String xs, String ys) {
this.x.set(xs);
this.y.set(ys);
return similarity();
}
public double similarity(char[] x, int x0, int x1, char[] y, int y0, int y1) {
this.x.set(x, x0, x1);
this.y.set(y, y0, y1);
return similarity();
}
private double similarity() {
double distance = distance(this.x.start, this.y.start);
if(Double.isInfinite(distance)) {
return 0;
}
double n = (double) Math.max(this.x.length(), this.y.length());
return (n-distance) / n;
}
public double distance(String xs, String ys) {
x.set(xs);
y.set(ys);
return distance(x.start, y.start);
}
private double distance(int i, int j) {
boolean eox = x.isEndOfWord(i);
boolean eoy = y.isEndOfWord(j);
if(eox && eoy) {
return 0;
} else if(eox) {
return trailingDistance(y, j);
} else if(eoy) {
return trailingDistance(x, i);
} else {
double d = Double.POSITIVE_INFINITY;
// if(x.isVowel(i) && y.isVowel(j)) {
// double substitutionCost = nucleusDistance(i, j);
// }
double substitutionCost = substitutionDistance(i,j);
if(substitutionCost < d) {
substitutionCost += distance(i+1, j+1);
d = min(d, substitutionCost);
}
double insertionCostX = insertionCost(x, i);
if(insertionCostX < d) {
insertionCostX += distance(i+1, j);
d = min(d, insertionCostX);
}
double insertionCostY = insertionCost(y, j);
if(insertionCostY < d) {
insertionCostY += distance(i, j+1);
d = min(d, insertionCostY);
}
return d;
}
}
/**
* Calculates the distance between the nuclei of two syllables
*/
private double nucleusDistance(int i, int j) {
int xn = 1;
while(x.isVowel(i+xn)) {
xn++;
}
int yn = 1;
while(y.isVowel(i+yn)) {
yn++;
}
return -1;
}
/**
* Assign a cost to inserting the character at i in word {@code w}
*/
private double insertionCost(Word w, int i) {
char c = w.at(i);
if(isVowelChar(c)) {
return EXTRA_VOWEL_COST;
} else if(c == 'Y') {
if(w.isVowel(i-1)) {
return 1.0;
}
} else /* consonants */ {
if(w.isRepeated(i)) {
return DOUBLED_CONSONANT_COST;
}
int next = w.at(i + 1);
if(c == 'D' && next == 'J') {
return 0.5;
} else if(c == 'N' && (next == 'D' || next == 'G' || next == 'K')) {
return 0.5;
}
}
return Double.POSITIVE_INFINITY;
}
private double substitutionDistance(int i, int j) {
char cx = x.at(i);
char cy = y.at(j);
if(cx == cy) {
return 0;
}
boolean vx = isVowelChar(cx);
boolean vy = isVowelChar(cy);
if(vx && vy) {
if(cx < cy) {
return vowelDistance(cx, cy);
} else {
return vowelDistance(cy, cx);
}
} else if(!vx && !vy) {
return consonants.distance(cx, cy);
} else {
return Double.POSITIVE_INFINITY;
}
}
private double vowelDistance(char k, char m) {
if(k == 'A' && (m == 'E')) {
return 0.25;
} else if(k == 'E' && (m == 'I')) {
return 0.25;
} else if(k == 'O' && (m == 'U')) {
return 0.25;
} else {
return 1.0;
}
}
/**
* We consider extra letters largely to be infinitely far,
* with the exception of extra vowels, which are often dropped
* or
*/
private double trailingDistance(Word word, int i) {
while(!word.isEndOfWord(i)) {
char c = word.at(i);
if(!isVowelChar(c)) {
if(word.isLast(i) && word.isVowel(i-1) &&
(c == 'H' || c == 'Y' || c == 'T')) {
return CLOSING_CONSONANT_COST;
} else {
return Double.POSITIVE_INFINITY;
}
}
i++;
}
return TRAILING_VOWELS_COST;
}
}