package org.xbib.elasticsearch.index.mapper.langdetect;
import org.junit.Assert;
import org.junit.Test;
import org.xbib.elasticsearch.common.langdetect.NGram;
/**
 * Unit tests for {@link NGram}: the n-gram size constant, per-character
 * normalization, and the sliding n-gram window built with
 * {@link NGram#addChar(char)} and read with {@link NGram#get(int)}.
 *
 * <p>All assertions follow the JUnit convention {@code (expected, actual)} so
 * that failure messages label the two values correctly.
 */
public class NGramTest extends Assert {

    /**
     * Checks that the maximum n-gram length constant is 3.
     */
    @Test
    public final void testConstants() {
        assertEquals(3, NGram.N_GRAM);
    }

    /**
     * Test method for {@link NGram#normalize(char)} with characters from the
     * Basic Latin and Latin-1 ranges.
     *
     * <p>The expectations below are: ASCII letters are kept as they are, every
     * other probed ASCII character (controls, space, digit, punctuation) and
     * the no-break space U+00A0 become a plain space, while U+0080 and U+00A1
     * are kept.
     */
    @Test
    public final void testNormalizeWithLatin() {
        // Control characters, space, a digit and '@' are expected to collapse to a space.
        assertEquals(' ', NGram.normalize('\u0000'));
        assertEquals(' ', NGram.normalize('\u0009'));
        assertEquals(' ', NGram.normalize('\u0020'));
        assertEquals(' ', NGram.normalize('\u0030'));
        assertEquals(' ', NGram.normalize('\u0040'));

        // Boundaries of the upper-case range 'A'..'Z' and the characters just after it.
        assertEquals('\u0041', NGram.normalize('\u0041'));
        assertEquals('\u005a', NGram.normalize('\u005a'));
        assertEquals(' ', NGram.normalize('\u005b'));
        assertEquals(' ', NGram.normalize('\u0060'));

        // Boundaries of the lower-case range 'a'..'z' and the characters just after it.
        assertEquals('\u0061', NGram.normalize('\u0061'));
        assertEquals('\u007a', NGram.normalize('\u007a'));
        assertEquals(' ', NGram.normalize('\u007b'));
        assertEquals(' ', NGram.normalize('\u007f'));

        // Latin-1 supplement: only the no-break space U+00A0 is expected to become a space.
        assertEquals('\u0080', NGram.normalize('\u0080'));
        assertEquals(' ', NGram.normalize('\u00a0'));
        assertEquals('\u00a1', NGram.normalize('\u00a1'));
    }

    /**
     * Test method for {@link NGram#normalize(char)} with CJK Kanji characters.
     *
     * <p>Most probed code points are expected to map to themselves. Three are
     * expected to map to a different code point: U+4E03 to U+4E01, and both
     * U+4E24 and U+4E25 to U+4E13.
     */
    @Test
    public final void testNormalizeWithCJKKanji() {
        assertEquals('\u4E00', NGram.normalize('\u4E00'));
        assertEquals('\u4E01', NGram.normalize('\u4E01'));
        assertEquals('\u4E02', NGram.normalize('\u4E02'));
        // U+4E03 is expected to be folded onto U+4E01 (intentional, not a typo).
        assertEquals('\u4E01', NGram.normalize('\u4E03'));
        assertEquals('\u4E04', NGram.normalize('\u4E04'));
        assertEquals('\u4E05', NGram.normalize('\u4E05'));
        assertEquals('\u4E06', NGram.normalize('\u4E06'));
        assertEquals('\u4E07', NGram.normalize('\u4E07'));
        assertEquals('\u4E08', NGram.normalize('\u4E08'));
        assertEquals('\u4E09', NGram.normalize('\u4E09'));
        assertEquals('\u4E10', NGram.normalize('\u4E10'));
        assertEquals('\u4E11', NGram.normalize('\u4E11'));
        assertEquals('\u4E12', NGram.normalize('\u4E12'));
        assertEquals('\u4E13', NGram.normalize('\u4E13'));
        assertEquals('\u4E14', NGram.normalize('\u4E14'));
        assertEquals('\u4E15', NGram.normalize('\u4E15'));
        assertEquals('\u4E1e', NGram.normalize('\u4E1e'));
        assertEquals('\u4E1f', NGram.normalize('\u4E1f'));
        assertEquals('\u4E20', NGram.normalize('\u4E20'));
        assertEquals('\u4E21', NGram.normalize('\u4E21'));
        assertEquals('\u4E22', NGram.normalize('\u4E22'));
        assertEquals('\u4E23', NGram.normalize('\u4E23'));
        // U+4E24 and U+4E25 are both expected to be folded onto U+4E13.
        assertEquals('\u4E13', NGram.normalize('\u4E24'));
        assertEquals('\u4E13', NGram.normalize('\u4E25'));
        assertEquals('\u4E30', NGram.normalize('\u4E30'));
    }

    /**
     * Test method for {@link NGram#get(int)} and {@link NGram#addChar(char)}.
     *
     * <p>Feeds characters from several scripts one at a time and, after each
     * one, checks the 1-, 2- and 3-grams ending at the most recent character.
     * The expected strings contain the normalized form of each added character
     * (for example, adding U+06CC is expected to be read back as U+064A).
     */
    @Test
    public final void testNGram() {
        NGram ngram = new NGram();

        // A fresh instance yields no n-gram for any probed length, including 0 and 4.
        assertNull(ngram.get(0));
        assertNull(ngram.get(1));
        assertNull(ngram.get(2));
        assertNull(ngram.get(3));
        assertNull(ngram.get(4));

        // A lone space yields no n-grams.
        ngram.addChar(' ');
        assertNull(ngram.get(1));
        assertNull(ngram.get(2));
        assertNull(ngram.get(3));

        // First letter after a space: unigram and bigram exist, trigram does not yet.
        ngram.addChar('A');
        assertEquals("A", ngram.get(1));
        assertEquals(" A", ngram.get(2));
        assertNull(ngram.get(3));

        // Arabic-script character, expected to be read back as U+064A.
        ngram.addChar('\u06cc');
        assertEquals("\u064a", ngram.get(1));
        assertEquals("A\u064a", ngram.get(2));
        assertEquals(" A\u064a", ngram.get(3));

        // Latin Extended Additional character, expected to be read back as U+1EC3.
        ngram.addChar('\u1ea0');
        assertEquals("\u1ec3", ngram.get(1));
        assertEquals("\u064a\u1ec3", ngram.get(2));
        assertEquals("A\u064a\u1ec3", ngram.get(3));

        // Hiragana character, expected to be read back as U+3042.
        ngram.addChar('\u3044');
        assertEquals("\u3042", ngram.get(1));
        assertEquals("\u1ec3\u3042", ngram.get(2));
        assertEquals("\u064a\u1ec3\u3042", ngram.get(3));

        // Katakana character, expected to be read back as U+30A2.
        ngram.addChar('\u30a4');
        assertEquals("\u30a2", ngram.get(1));
        assertEquals("\u3042\u30a2", ngram.get(2));
        assertEquals("\u1ec3\u3042\u30a2", ngram.get(3));

        // Bopomofo character, expected to be read back as U+3105.
        ngram.addChar('\u3106');
        assertEquals("\u3105", ngram.get(1));
        assertEquals("\u30a2\u3105", ngram.get(2));
        assertEquals("\u3042\u30a2\u3105", ngram.get(3));

        // Hangul syllable, expected to be read back as U+AC00.
        ngram.addChar('\uac01');
        assertEquals("\uac00", ngram.get(1));
        assertEquals("\u3105\uac00", ngram.get(2));
        assertEquals("\u30a2\u3105\uac00", ngram.get(3));

        // U+2010 (hyphen) is expected to behave like a space: no unigram,
        // and the longer n-grams end in a space.
        ngram.addChar('\u2010');
        assertNull(ngram.get(1));
        assertEquals("\uac00 ", ngram.get(2));
        assertEquals("\u3105\uac00 ", ngram.get(3));

        // First letter after the separator: the trigram is absent again,
        // presumably because it would span the separator - confirm against NGram.
        ngram.addChar('a');
        assertEquals("a", ngram.get(1));
        assertEquals(" a", ngram.get(2));
        assertNull(ngram.get(3));
    }
}