// Copyright 2014 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package marmot.test.tokenizer.opennlp; import static org.junit.Assert.*; import java.util.LinkedList; import java.util.List; import marmot.tokenize.openlp.Aligner; import marmot.tokenize.openlp.Aligner.Result; import marmot.tokenize.openlp.Aligner.ResultType; import marmot.tokenize.openlp.LevenshteinAligner; import marmot.tokenize.openlp.Aligner.Pair; import org.junit.Test; public class LevenshteinAlignerTest { public void bothWayTest(String tok, String untok, List<Pair> expected, long time) { Result actual; Aligner a = new LevenshteinAligner(); actual = a.align(untok, tok); assertEquals(expected, actual.pairs); // Reverse problem List<Pair> expected_reversed = new LinkedList<Pair>(); for (Pair pair : expected) { expected_reversed.add(new Pair(pair.b, pair.a)); } actual = a.align(tok, untok); assertEquals(expected_reversed, actual.pairs); } public void bothWayTest(String tok, String untok, List<Pair> expected) { bothWayTest(tok, untok, expected, 1000); } @Test public void toyExampleAlignTest() { String tok, untok; List<Pair> expected; tok = "A -LRB- B"; untok = "A (B"; expected = new LinkedList<Pair>(); expected.add(new Pair(0, 0)); expected.add(new Pair(1, 1)); expected.add(new Pair(2, 2)); expected.add(new Pair(2, 3)); expected.add(new Pair(2, 4)); expected.add(new Pair(2, 5)); expected.add(new Pair(2, 6)); expected.add(new Pair(-1, 7)); expected.add(new Pair(3, 8)); bothWayTest(tok, untok, expected); tok = "A -- C -- B"; untok = "A (C) B"; expected = new LinkedList<Pair>(); expected.add(new Pair(0, 0)); expected.add(new Pair(1, 1)); expected.add(new Pair(2, 2)); expected.add(new Pair(2, 3)); expected.add(new Pair(-1, 4)); expected.add(new Pair(3, 5)); expected.add(new Pair(-1, 6)); expected.add(new Pair(4, 7)); expected.add(new Pair(4, 8)); expected.add(new Pair(5, 9)); expected.add(new Pair(6, 10)); bothWayTest(tok, untok, expected); tok = "A L B"; untok = "A (B"; expected = new LinkedList<Pair>(); expected.add(new Pair(0, 0)); expected.add(new Pair(1, 1)); expected.add(new Pair(2, 2)); expected.add(new Pair(-1, 3)); expected.add(new Pair(3, 4)); bothWayTest(tok, untok, expected); } @Test public void insertProblemTest() { String tok, untok; List<Pair> expected; tok = "A -- -- B"; untok = "A () B"; expected = new LinkedList<Pair>(); expected.add(new Pair(0, 0)); expected.add(new Pair(1, 1)); expected.add(new Pair(2, 2)); expected.add(new Pair(2, 3)); expected.add(new Pair(-1, 4)); expected.add(new Pair(3, 5)); expected.add(new Pair(3, 6)); expected.add(new Pair(4, 7)); expected.add(new Pair(5, 8)); bothWayTest(tok, untok, expected); } public List<Pair> getParanExpected(String tok) { List<Pair> expected = new LinkedList<Pair>(); int untok_index = 0; for (int i=0; i<tok.length(); i++) { if (i % 3 == 2) { expected.add(new Pair(-1, i)); untok_index += 1; } else { expected.add(new Pair(untok_index, i)); } } return expected; } @Test public void paranTest() { String tok, untok; List<Pair> expected; tok = "-- --"; untok = "()"; expected = new LinkedList<Pair>(); expected.add(new Pair(0, 0)); expected.add(new Pair(0, 1)); expected.add(new Pair(-1, 2)); expected.add(new Pair(1, 3)); expected.add(new Pair(1, 4)); bothWayTest(tok, untok, expected); tok = "-- -- -- --"; untok = "(())"; expected = getParanExpected(tok); bothWayTest(tok, untok, expected); tok = "-- -- -- -- -- -- -- --"; untok = "(((())))"; expected = getParanExpected(tok); bothWayTest(tok, untok, expected); tok = "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --"; untok = "(((((((())))))))"; expected = getParanExpected(tok); bothWayTest(tok, untok, expected); } @Test public void bigParanTest() { // Algorithm is to slow for the following example. But it's pretty extreme ... String tok, untok; tok = "-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --"; untok = "(((((((((((((((())))))))))))))))"; expectTimeout(tok, untok, 1000); } private void expectTimeout(String tok, String untok, long time) { expectNoAlign(tok, untok, ResultType.Timeout, time); } private void expectNoAlign(String tok, String untok) { expectNoAlign(tok, untok, ResultType.NoAlignmentFound, 1000); } private void expectNoAlign(String tok, String untok, ResultType type, long time) { Aligner a = new LevenshteinAligner(time); Result actual = a.align(untok, tok); System.err.println(actual); assertEquals(null, actual.pairs); assertEquals(type, actual.result_type); } @Test public void quasiRealWorldTest() { String tok, untok; List<Pair> expected; tok = "„ „ AAA “ “"; untok = "\"\"AAA\"\""; expected = new LinkedList<Pair>(); expected.add(new Pair(0, 0)); expected.add(new Pair(-1, 1)); expected.add(new Pair(1, 2)); expected.add(new Pair(-1, 3)); expected.add(new Pair(2, 4)); expected.add(new Pair(3, 5)); expected.add(new Pair(4, 6)); expected.add(new Pair(-1, 7)); expected.add(new Pair(5, 8)); expected.add(new Pair(-1, 9)); expected.add(new Pair(6, 10)); bothWayTest(tok, untok, expected); tok = "B BBB B"; untok = "B(B"; expected = new LinkedList<Pair>(); expected.add(new Pair(0, 0)); expected.add(new Pair(-1, 1)); expected.add(new Pair(1, 2)); expected.add(new Pair(1, 3)); expected.add(new Pair(1, 4)); expected.add(new Pair(-1, 5)); expected.add(new Pair(2, 6)); bothWayTest(tok, untok, expected); } @Test public void realSentenceTest() { //timeout is to expected String tok, untok; List<Pair> expected = new LinkedList<>(); tok = "VI Fz Fz C."; untok = "VI C."; expectNoAlign(tok, untok); tok = "dá me lo"; untok = "dámelo"; expected.clear(); expected.add(new Pair(0,0)); expected.add(new Pair(1,1)); expected.add(new Pair(-1,2)); expected.add(new Pair(2,3)); expected.add(new Pair(3,4)); expected.add(new Pair(-1,5)); expected.add(new Pair(4,6)); expected.add(new Pair(5,7)); bothWayTest(tok, untok, expected); tok = "de el"; untok = "deL"; expected.clear(); expected.add(new Pair(0,0)); expected.add(new Pair(1,1)); expected.add(new Pair(-1,2)); expected.add(new Pair(2,3)); expected.add(new Pair(2,4)); bothWayTest(tok, untok, expected); tok = "de el"; untok = "del"; expectNoAlign(tok, untok); // untok contains a weird space character (char value 160 instead of 32) tok = "L 1 bis L 499"; untok = "L 1 bis L 499"; expected.clear(); for (int i=0; i<tok.length(); i++) { expected.add(new Pair(i,i)); } bothWayTest(tok, untok, expected); // untok contains a weird space character (char value 160 instead of 32) tok = "vorgesehen , für diejenigen im Gebiet des Landschaftsverbandes Westfalen-Lippe die Bezeichnungen von L 501 bis L 999 ."; untok = "vorgesehen, für diejenigen im Gebiet des Landschaftsverbandes Westfalen-Lippe die Bezeichnungen von L 501 bis L 999."; expected.clear(); for (int i=0; i< 10; i++) { expected.add(new Pair(i,i)); } expected.add(new Pair(-1, 10)); for (int i=10; i< untok.length() - 1; i++) { expected.add(new Pair(i,i + 1)); } expected.add(new Pair(-1, untok.length())); expected.add(new Pair(untok.length() - 1, untok.length() + 1)); bothWayTest(tok, untok, expected); } }