package org.activityinfo.core.shared.importing.match.names;
import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import java.io.IOException;
import java.util.List;
import static com.google.common.io.Resources.getResource;
import static com.google.common.io.Resources.readLines;
/**
 * Exercises {@link LatinPlaceNameScorer} against lists of place names.
 *
 * <p>{@link #test()} is {@code @Ignore}d: it makes no assertions and only prints a
 * report of how well the scorer pairs up names from the bundled test sets.
 * {@link #nonMatches()} is a real assertion-based test.
 */
public class NameMatchingIntegrationTest {

    /** Width, in characters, to which names are right-padded in the printed report. */
    public static final int COLUMN_WIDTH = 30;

    /** Scorer under test; recreated before every test method by {@link #setUp()}. */
    private LatinPlaceNameScorer scorer;

    /** First column of the currently loaded test set. */
    private List<String> a;

    /** Second column of the currently loaded test set; {@code b.get(i)} is the expected match for {@code a.get(i)}. */
    private List<String> b;

    // Running totals accumulated over every test set processed by printReport()
    private int falsePositives = 0;
    private int noMatch = 0;
    private int matches = 0;

    @Before
    public void setUp() {
        scorer = new LatinPlaceNameScorer();
    }

    /**
     * Runs the report over each bundled test set and prints the accumulated totals.
     *
     * @throws IOException if a test set cannot be read or contains a malformed line
     */
    @Test
    @Ignore
    public void test() throws IOException {
        test("lebanon.txt");
        test("philipines.txt");
        test("mali.txt");

        System.out.println("MATCHED: " + matches);
        System.out.println("FALSE POSITIVES: " + falsePositives);
        System.out.println("NO MATCH: " + noMatch);
    }

    /**
     * Loads a single test set and prints its report.
     *
     * @param resource name of the test set resource
     * @throws IOException if the resource cannot be read or contains a malformed line
     */
    private void test(String resource) throws IOException {
        loadTestSet(resource);
        printReport();
    }

    /**
     * For every name in {@code a}, finds the best scoring name in {@code b} and
     * compares it with the expected partner at the same index. Failures and
     * false positives are printed; the three counters are updated accordingly.
     */
    private void printReport() {
        for (int i = 0; i < a.size(); ++i) {
            String x = a.get(i);
            int bestMatch = findBestMatch(x, b);
            String correctMatch = b.get(i);
            double correctScore = scorer.score(x, correctMatch);

            if (bestMatch == -1) {
                // Nothing in b scored above zero
                noMatch++;
                System.out.println(Strings.padEnd(x, COLUMN_WIDTH, ' ') +
                        " FAILED TO MATCH " + formatScore(correctScore) + correctMatch);
                continue;
            }

            String matchedName = b.get(bestMatch);
            if (bestMatch == i || matchedName.equals(correctMatch)) {
                // Same index, or a different index holding an identical name
                matches++;
            } else {
                double matchedScore = scorer.score(x, matchedName);
                System.out.println(Strings.padEnd(x, COLUMN_WIDTH, ' ') +
                        " MATCHED " + formatScore(matchedScore) +
                        Strings.padEnd(matchedName, COLUMN_WIDTH, ' ') +
                        " INSTEAD OF " + formatScore(correctScore) + correctMatch);
                falsePositives++;
            }
        }
    }

    /**
     * Replaces {@code a} and {@code b} with the contents of the given resource,
     * which is looked up via {@link LatinPlaceNameScorer}'s class and read as UTF-8.
     *
     * @param resourceName name of the test set resource
     * @throws IOException if the resource cannot be read, or if a line does not
     *                     split into exactly two columns on {@code |}
     */
    private void loadTestSet(String resourceName) throws IOException {
        // Each line in our test sets contains a pair of names that
        // refer to the same entity, but differ by spelling, transliteration
        // method or other messiness
        a = Lists.newArrayList();
        b = Lists.newArrayList();

        List<String> testSet = readLines(getResource(LatinPlaceNameScorer.class, resourceName), Charsets.UTF_8);
        for (String pair : testSet) {
            String[] columns = pair.split("\\|");
            if (columns.length != 2) {
                throw new IOException("Bad format for line: " + pair);
            }
            a.add(columns[0]);
            b.add(columns[1]);
        }
    }

    /**
     * Formats a score as a percentage, left-padded to three characters and
     * wrapped in parentheses, e.g. {@code " ( 87) "}.
     *
     * @param score the score to format
     * @return the formatted percentage, surrounded by single spaces
     */
    private String formatScore(double score) {
        // Round rather than truncate: a plain (int) cast turns e.g. 0.29 * 100
        // (28.999999999999996 in floating point) into 28.
        long percent = Math.round(score * 100d);
        return " (" + Strings.padStart(Long.toString(percent), 3, ' ') + ") ";
    }

    /**
     * Finds the candidate with the highest score against {@code x}.
     *
     * @param x          the name to match
     * @param candidates the names to score against {@code x}
     * @return the index of the highest scoring candidate (the earliest one on a
     *         tie), or {@code -1} if no candidate scores above zero
     */
    private int findBestMatch(String x, List<String> candidates) {
        double bestScore = 0;
        int bestMatch = -1;
        for (int i = 0; i < candidates.size(); ++i) {
            double score = scorer.score(x, candidates.get(i));
            if (score > bestScore) {
                bestMatch = i;
                bestScore = score;
            }
        }
        return bestMatch;
    }

    @Test
    public void nonMatches() {
        // These should not match at all.
        // They are a separate set of entities, and it's obvious
        // to the naked eye that they are nowhere near close.
        // It's important that our scorer is able to see this.
        List<String> regions = Lists.<String>newArrayList(
                "Region de Bruxelles-Capitale / Brussels Hoofdstede",
                "Vlaams Gewest",
                "Region wallonne");

        assertNoMatch("West-Vla", regions);
        assertNoMatch("Oost-Vla", regions);
        assertNoMatch("Antwerpen", regions);
        assertNoMatch("Brussel", regions);
        assertNoMatch("Limburg", regions);
        assertNoMatch("Vla-Bra", regions);
    }

    /**
     * Fails if {@code province} receives a score above zero against any of the
     * given regions. Uses the scorer created in {@link #setUp()}.
     *
     * @param province the name that must not match
     * @param regions  the names it is scored against
     * @throws AssertionError naming the first region that scored above zero
     */
    private void assertNoMatch(String province, List<String> regions) {
        for (String region : regions) {
            double score = scorer.score(province, region);
            if (score > 0) {
                throw new AssertionError(province + " matched " + region + formatScore(score));
            }
        }
    }
}