package edu.berkeley.cs.nlp.ocular.data.textreader;
import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.*;
import static java.util.Arrays.asList;
import static org.junit.Assert.*;
import java.util.List;
import org.junit.Test;
import edu.berkeley.cs.nlp.ocular.util.Tuple2;
/**
 * Unit tests for the static character helpers of {@link Charset}: punctuation
 * detection, unescaping and escaping of diacritics, normalization, diacritic
 * removal, and splitting a string into normalized characters.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class CharsetTests {

    /** Escaped grave and acute, then a precomposed n-with-tilde followed by combining macron and diaeresis. */
    private static final String STACKED_ON_PRECOMPOSED_N =
            "\\`\\'ñ" + MACRON_COMBINING + DIAERESIS_COMBINING;

    /** Same as {@link #STACKED_ON_PRECOMPOSED_N}, but the tilde is given as a combining mark on a plain n. */
    private static final String STACKED_ON_DECOMPOSED_N =
            "\\`\\'n" + TILDE_COMBINING + MACRON_COMBINING + DIAERESIS_COMBINING;

    /** Escaped grave and acute, then a q carrying combining tilde, macron and diaeresis. */
    private static final String STACKED_ON_Q =
            "\\`\\'q" + TILDE_COMBINING + MACRON_COMBINING + DIAERESIS_COMBINING;

    /**
     * Runs {@code normalizeCharSeparateDiacritics} once on {@code input} and checks
     * both halves of the returned pair.
     *
     * @param expectedBase       expected first component of the result
     * @param expectedDiacritics expected second component of the result
     * @param input              the string handed to the function under test
     */
    private static void assertSeparated(String expectedBase, List<?> expectedDiacritics, String input) {
        Tuple2<String, List<String>> parts = normalizeCharSeparateDiacritics(input);
        assertEquals(expectedBase, parts._1);
        assertEquals(expectedDiacritics, parts._2);
    }

    /** Letters (with or without diacritics) are not punctuation; ";" and an escaped backslash are. */
    @Test
    public void test_isPunctuationChar() {
        assertFalse(isPunctuationChar("t"));
        assertFalse(isPunctuationChar("q̃"));
        assertFalse(isPunctuationChar("\\~q"));
        assertFalse(isPunctuationChar(STACKED_ON_PRECOMPOSED_N));

        assertTrue(isPunctuationChar(";"));
        assertTrue(isPunctuationChar("\\\\"));

        // Two characters in one argument must be rejected.
        try {
            isPunctuationChar(";;");
            fail("no exception thrown");
        } catch (RuntimeException e) {
            // The result of contains() has to be asserted; evaluating it alone verifies nothing.
            String message = e.getMessage();
            assertTrue("unexpected exception message: " + message,
                    message != null && message.contains("contains more than one character"));
        }
    }

    /** Checks the single-argument {@code unescapeChar}. */
    @Test
    public void test_unescapeChar() {
        String stackedN = "ñ" + MACRON_COMBINING + DIAERESIS_COMBINING + ACUTE_COMBINING + GRAVE_COMBINING;
        String stackedQ = "q" + TILDE_COMBINING + MACRON_COMBINING + DIAERESIS_COMBINING + ACUTE_COMBINING + GRAVE_COMBINING;
        String qWithTilde = "q" + TILDE_COMBINING;

        assertEquals(stackedN, unescapeChar(STACKED_ON_PRECOMPOSED_N));
        assertEquals(stackedN, unescapeChar(STACKED_ON_DECOMPOSED_N));
        assertEquals(stackedQ, unescapeChar(STACKED_ON_Q));

        assertEquals("ñ", unescapeChar("ñ"));
        assertEquals("ñ", unescapeChar("\\~n"));
        assertEquals(qWithTilde, unescapeChar("q" + TILDE_COMBINING));
        assertEquals(qWithTilde, unescapeChar("\\~q"));
        assertEquals("ı", unescapeChar("\\ii"));
        assertEquals("ı", unescapeChar("ı"));
        assertEquals("\\", unescapeChar("\\\\"));
    }

    /** Checks {@code unescapeChar} when its second argument is {@code true}. */
    @Test
    public void test_unescapeChar_precomposedOnly() {
        String escapesThenN = GRAVE_ESCAPE + ACUTE_ESCAPE + DIAERESIS_ESCAPE + MACRON_ESCAPE + "ñ";
        String escapesThenQ = GRAVE_ESCAPE + ACUTE_ESCAPE + DIAERESIS_ESCAPE + MACRON_ESCAPE + TILDE_ESCAPE + "q";

        assertEquals(escapesThenN, unescapeChar(STACKED_ON_PRECOMPOSED_N, true));
        assertEquals(escapesThenN, unescapeChar(STACKED_ON_DECOMPOSED_N, true));
        assertEquals(escapesThenQ, unescapeChar(STACKED_ON_Q, true));

        assertEquals("ñ", unescapeChar("ñ", true));
        assertEquals("ñ", unescapeChar("\\~n", true));
        assertEquals("\\~q", unescapeChar("q" + TILDE_COMBINING, true));
        assertEquals("\\~q", unescapeChar("\\~q", true));
        assertEquals("ı", unescapeChar("\\ii", true));
        assertEquals("ı", unescapeChar("ı", true));
        assertEquals("\\", unescapeChar("\\\\", true));
    }

    /** Checks {@code fullyEscapeChar}: every diacritic comes back as an escape in front of the bare letter. */
    @Test
    public void test_fullyEscapeChar() {
        String escapesThenN = GRAVE_ESCAPE + ACUTE_ESCAPE + DIAERESIS_ESCAPE + MACRON_ESCAPE + TILDE_ESCAPE + "n";
        String escapesThenQ = GRAVE_ESCAPE + ACUTE_ESCAPE + DIAERESIS_ESCAPE + MACRON_ESCAPE + TILDE_ESCAPE + "q";

        assertEquals(escapesThenN, fullyEscapeChar(STACKED_ON_PRECOMPOSED_N));
        assertEquals(escapesThenN, fullyEscapeChar(STACKED_ON_DECOMPOSED_N));
        assertEquals(escapesThenQ, fullyEscapeChar(STACKED_ON_Q));

        assertEquals("\\~n", fullyEscapeChar("ñ"));
        assertEquals("\\~n", fullyEscapeChar("\\~n"));
        assertEquals("\\~q", fullyEscapeChar("q" + TILDE_COMBINING));
        assertEquals("\\~q", fullyEscapeChar("\\~q"));
        assertEquals("\\ii", fullyEscapeChar("\\ii"));
        assertEquals("\\ii", fullyEscapeChar("ı"));
        assertEquals("\\\\", fullyEscapeChar("\\\\"));
    }

    /** Checks both components returned by {@code normalizeCharSeparateDiacritics}, plus its rejection of a letterless input. */
    @Test
    public void test_normalizeCharSeparateDiacritics() {
        List<String> allFiveMarks = asList(TILDE_COMBINING, MACRON_COMBINING, DIAERESIS_COMBINING, ACUTE_COMBINING, GRAVE_COMBINING);
        List<String> tildeOnly = asList(TILDE_COMBINING);

        assertSeparated("n", allFiveMarks, STACKED_ON_PRECOMPOSED_N);
        assertSeparated("n", allFiveMarks, STACKED_ON_DECOMPOSED_N);
        assertSeparated("q", allFiveMarks, STACKED_ON_Q);

        assertSeparated("t", asList(), "t");
        assertSeparated("n", tildeOnly, "ñ");
        assertSeparated("n", tildeOnly, "\\~n");
        assertSeparated("q", tildeOnly, "q̃");
        assertSeparated("q", tildeOnly, "q" + TILDE_COMBINING);
        assertSeparated("q", tildeOnly, "\\~q");
        assertSeparated("\\\\", asList(), "\\\\");

        // An escape code followed only by a combining mark has no base letter.
        try {
            Tuple2<String, List<String>> r = normalizeCharSeparateDiacritics(MACRON_ESCAPE + "" + TILDE_COMBINING);
            fail("Exception expected, found: [" + r + "]");
        } catch (RuntimeException expected) {
            // Any RuntimeException is accepted. A check that the message equals
            // "Character contains only escape codes!" existed here but was disabled;
            // the reason is not recorded, so it stays off.
        }
    }

    /** Checks that {@code removeAnyDiacriticFromChar} returns just the base character. */
    @Test
    public void test_removeAnyDiacriticFromChar() {
        assertEquals("n", removeAnyDiacriticFromChar(STACKED_ON_PRECOMPOSED_N));
        assertEquals("n", removeAnyDiacriticFromChar(STACKED_ON_DECOMPOSED_N));
        assertEquals("q", removeAnyDiacriticFromChar(STACKED_ON_Q));

        assertEquals("t", removeAnyDiacriticFromChar("t"));
        assertEquals("n", removeAnyDiacriticFromChar("ñ"));
        assertEquals("n", removeAnyDiacriticFromChar("\\~n"));
        assertEquals("q", removeAnyDiacriticFromChar("q̃"));
        assertEquals("q", removeAnyDiacriticFromChar("q" + TILDE_COMBINING));
        assertEquals("q", removeAnyDiacriticFromChar("\\~q"));
        assertEquals("\\\\", removeAnyDiacriticFromChar("\\\\"));
    }

    /** Checks that {@code normalizeChar} yields the base letter followed by combining marks. */
    @Test
    public void test_normalizeChar() {
        String qWithTilde = "q" + TILDE_COMBINING;
        String nWithTilde = "n" + TILDE_COMBINING;
        String stackedN = "n" + TILDE_COMBINING + MACRON_COMBINING + DIAERESIS_COMBINING + ACUTE_COMBINING + GRAVE_COMBINING;
        String stackedQ = "q" + TILDE_COMBINING + MACRON_COMBINING + DIAERESIS_COMBINING + ACUTE_COMBINING + GRAVE_COMBINING;

        assertEquals("t", normalizeChar("t"));
        assertEquals(qWithTilde, normalizeChar("q̃"));
        assertEquals(qWithTilde, normalizeChar("q" + TILDE_COMBINING));
        assertEquals(qWithTilde, normalizeChar("\\~q"));
        assertEquals(nWithTilde, normalizeChar("ñ"));
        assertEquals(nWithTilde, normalizeChar("\\~n"));
        assertEquals("a" + ACUTE_COMBINING, normalizeChar("á"));
        assertEquals("ı", normalizeChar("ı"));
        assertEquals("ı", normalizeChar("\\ii"));
        assertEquals("a\u0347", normalizeChar("a\u0347"));

        assertEquals(stackedN, normalizeChar(STACKED_ON_PRECOMPOSED_N));
        assertEquals(stackedN, normalizeChar(STACKED_ON_DECOMPOSED_N));
        assertEquals(stackedQ, normalizeChar(STACKED_ON_Q));

        assertEquals("\\\\", normalizeChar("\\\\"));
    }

    /** Checks that {@code readNormalizeCharacters} keeps each combining mark attached to the letter before it. */
    @Test
    public void test_readNormalizeCharacters() {
        assertEquals(asList("a", "b\u0311", "c", "d"), readNormalizeCharacters("ab\u0311cd"));
        assertEquals(asList("a", "b\uFE20", "c\uFE21", "d"), readNormalizeCharacters("ab\uFE20c\uFE21d"));
        assertEquals(asList("a", "b\u0361", "c", "d"), readNormalizeCharacters("ab\u0361cd"));
        assertEquals(asList("t", "a", "u\u0361", "g", "a", "a", "m"), readNormalizeCharacters("tau͡gaam"));
    }
}