// Copyright 2014 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.test.tokenizer.rules;
import static org.junit.Assert.*;
import marmot.tokenize.rules.RuleProvider;
import marmot.tokenize.rules.RulebasedTransformator;
import org.junit.Test;
public class RulebasedTransformatorTest {
public void testTokRules(String lang, String input, String expected){
RuleProvider p = RuleProvider.createRuleProvider(lang);
RulebasedTransformator r = p.getTokTransformator();
test(r, input, expected);
}
public void testUnTokRules(String lang, String input, String expected){
RuleProvider p = RuleProvider.createRuleProvider(lang);
RulebasedTransformator r = p.getUnTokTransformator();
test(r, input, expected);
}
private void test(RulebasedTransformator r, String input, String expected) {
String actual = r.applyRules(input);
assertEquals(expected, actual);
}
@Test
public void testSpanishTokRules() {
testUnTokRules("es", "no match.", "no match.");
testUnTokRules("es"," del. ", " de el. ");
testUnTokRules("es"," adela del ", " adela de el ");
testUnTokRules("es"," ádela del ", " ádela de el ");
testUnTokRules("es"," ödela del ", " ödela de el ");
testUnTokRules("es"," ñdela del ", " ñdela de el ");
testUnTokRules("es", "Pádel", "Pádel");
testUnTokRules("es", " del ", " de el ");
testUnTokRules("es", "del", "de el");
testUnTokRules("es", " del", " de el");
testUnTokRules("es", "del ", "de el ");
testUnTokRules("es", "(del)", "(de el)");
}
@Test
public void testSpanishUnTokRules() {
testTokRules("es", "dejando_de_lado", "dejando de lado");
testTokRules("es", "José_Ramon", "José Ramon");
testTokRules("es", "3_mayo", "3 mayo");
testTokRules("es", "van_a_conocer ", "van a conocer ");
//testTokRules("es", "6_de_enero_del_2007", "6 de enero de el 2007"); // del --> de el?
//testTokRules("es", "el s .XVII", "el s . XVII");
testTokRules("es", " 1.0 Fz ", " 1.0 ");
testTokRules("es", " 1.0 FZ ", " 1.0 FZ ");
testTokRules("es", " 1.0 Fz", " 1.0");
}
@Test
public void testCzechTokRules() {
testTokRules("cs", "Starověký Bejt Še ' arim", "Starověký Bejt Še ' arim");
testTokRules("cs", "" Elysium "", "\" Elysium \"");
}
}