package com.cybozu.labs.langdetect.util;
import java.lang.Character.UnicodeBlock;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Cuts n-grams out of a stream of characters.
 * Characters are fed one at a time through {@link #addChar(char)}, which
 * normalizes them and keeps the most recent ones in a small window;
 * {@link #get(int)} then reads the trailing n characters of that window.
 * Users don't use this class directly.
 * @author Nakatani Shuyo
 */
public class NGram {
    /** Latin-1 Supplement characters that {@link #normalize(char)} replaces with a blank. */
    private static final String LATIN1_EXCLUDED = Messages.getString("NGram.LATIN1_EXCLUDE");

    /** Longest n-gram that can be requested; also the maximum window length. */
    public final static int N_GRAM = 3;

    /**
     * Maps a CJK ideograph to the first character of the {@link #CJK_CLASS}
     * entry that contains it. Filled by the static initializer of this class.
     */
    public static HashMap<Character, Character> cjk_map;

    /** Window of the most recent normalized characters (never longer than {@link #N_GRAM}). */
    private StringBuffer grams_;

    /** Set once two consecutive upper-case characters were appended; cleared by any other character. */
    private boolean capitalword_;

    /**
     * Creates an extractor whose window holds a single blank.
     */
    public NGram() {
        startNewWord();
    }

    /**
     * Resets the window to a single blank and clears the capital-word flag.
     */
    private void startNewWord() {
        grams_ = new StringBuffer(" ");
        capitalword_ = false;
    }

    /**
     * Normalizes a character and pushes it into the n-gram window.
     * A blank that directly follows another blank is dropped.
     * @param ch character to append (normalized with {@link #normalize(char)} first)
     */
    public void addChar(char ch) {
        final char current = normalize(ch);
        final char previous = grams_.charAt(grams_.length() - 1);

        if (previous == ' ') {
            // The previous word is finished: restart from a lone blank.
            startNewWord();
            if (current == ' ') {
                return;
            }
        } else if (grams_.length() >= N_GRAM) {
            // Window is full: drop the oldest character to make room.
            grams_.deleteCharAt(0);
        }
        grams_.append(current);

        // An upper-case character following an upper-case one raises the flag,
        // a lone upper-case character leaves it as it was, anything else clears it.
        capitalword_ = Character.isUpperCase(current)
                && (capitalword_ || Character.isUpperCase(previous));
    }

    /**
     * Returns the n-gram formed by the last {@code n} characters of the window.
     * @param n length of n-gram
     * @return the n-gram, or {@code null} when the capital-word flag is set,
     *         {@code n} is outside 1..{@link #N_GRAM}, the window holds fewer
     *         than {@code n} characters, or {@code n} is 1 and the latest
     *         character is a blank
     */
    public String get(int n) {
        if (capitalword_ || n < 1 || n > N_GRAM) {
            return null;
        }
        final int size = grams_.length();
        if (size < n) {
            return null;
        }
        if (n > 1) {
            return grams_.substring(size - n, size);
        }
        final char latest = grams_.charAt(size - 1);
        return latest == ' ' ? null : Character.toString(latest);
    }

    /**
     * Normalizes a single character according to its Unicode block.
     * Several blocks are collapsed onto one fixed character, some individual
     * characters are remapped, and characters of blocks not listed here are
     * returned unchanged.
     * @param ch character to normalize
     * @return Normalized character
     */
    static public char normalize(char ch) {
        final Character.UnicodeBlock block = Character.UnicodeBlock.of(ch);

        if (block == UnicodeBlock.BASIC_LATIN) {
            // Only the ASCII letters A-Z and a-z survive; everything else becomes a blank.
            final boolean asciiLetter = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z');
            return asciiLetter ? ch : ' ';
        }
        if (block == UnicodeBlock.LATIN_1_SUPPLEMENT) {
            return LATIN1_EXCLUDED.indexOf(ch) >= 0 ? ' ' : ch;
        }
        if (block == UnicodeBlock.LATIN_EXTENDED_B) {
            // normalization for Romanian
            switch (ch) {
            case '\u0219':
                return '\u015f'; // Small S with comma below => with cedilla
            case '\u021b':
                return '\u0163'; // Small T with comma below => with cedilla
            default:
                return ch;
            }
        }
        if (block == UnicodeBlock.GENERAL_PUNCTUATION) {
            return ' ';
        }
        if (block == UnicodeBlock.ARABIC) {
            return ch == '\u06cc' ? '\u064a' : ch; // Farsi yeh => Arabic yeh
        }
        if (block == UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) {
            // Everything from U+1EA0 upwards in this block shares one representative.
            return ch >= '\u1ea0' ? '\u1ec3' : ch;
        }
        if (block == UnicodeBlock.HIRAGANA) {
            return '\u3042';
        }
        if (block == UnicodeBlock.KATAKANA) {
            return '\u30a2';
        }
        if (block == UnicodeBlock.BOPOMOFO || block == UnicodeBlock.BOPOMOFO_EXTENDED) {
            return '\u3105';
        }
        if (block == UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
            // Ideographs listed in CJK_CLASS map to their group's first character.
            if (cjk_map.containsKey(ch)) {
                return cjk_map.get(ch);
            }
            return ch;
        }
        if (block == UnicodeBlock.HANGUL_SYLLABLES) {
            return '\uac00';
        }
        return ch;
    }

    /**
     * Normalizer for Vietnamese.
     * Normalize Alphabet + Diacritical Mark(U+03xx) into U+1Exx .
     * Each letter/mark pair matched by {@link #ALPHABET_WITH_DMARK} is replaced
     * by the single character found at the letter's index inside the
     * {@link #NORMALIZED_VI_CHARS} entry selected by the mark's index.
     * @param text text to normalize
     * @return normalized text; the given {@code text} itself when nothing was replaced
     */
    public static String normalize_vi(String text) {
        final Matcher matcher = ALPHABET_WITH_DMARK.matcher(text);
        final StringBuffer out = new StringBuffer();
        while (matcher.find()) {
            final int letterIndex = TO_NORMALIZE_VI_CHARS.indexOf(matcher.group(1));
            final int markIndex = DMARK_CLASS.indexOf(matcher.group(2)); // Diacritical Mark
            final String composed =
                    NORMALIZED_VI_CHARS[markIndex].substring(letterIndex, letterIndex + 1);
            matcher.appendReplacement(out, composed);
        }
        // An empty buffer means the pattern never matched.
        return out.length() == 0 ? text : matcher.appendTail(out).toString();
    }

    /**
     * Replacement tables for {@link #normalize_vi(String)}: one string per
     * diacritical mark, indexed in the same order as {@link #DMARK_CLASS}.
     */
    private static final String[] NORMALIZED_VI_CHARS = {
        Messages.getString("NORMALIZED_VI_CHARS_0300"),
        Messages.getString("NORMALIZED_VI_CHARS_0301"),
        Messages.getString("NORMALIZED_VI_CHARS_0303"),
        Messages.getString("NORMALIZED_VI_CHARS_0309"),
        Messages.getString("NORMALIZED_VI_CHARS_0323") };

    /** Base letters that may be combined with a diacritical mark. */
    private static final String TO_NORMALIZE_VI_CHARS = Messages.getString("TO_NORMALIZE_VI_CHARS");

    /** Diacritical marks handled by {@link #normalize_vi(String)}. */
    private static final String DMARK_CLASS = Messages.getString("DMARK_CLASS");

    /** Matches one base letter (group 1) immediately followed by one diacritical mark (group 2). */
    private static final Pattern ALPHABET_WITH_DMARK = Pattern.compile("([" + TO_NORMALIZE_VI_CHARS + "])(["
            + DMARK_CLASS + "])");

    /**
     * CJK Kanji Normalization Mapping.
     * Each entry is a group of ideographs; the first character of an entry
     * acts as the representative for every character in it.
     */
    static final String[] CJK_CLASS = {
        Messages.getString("NGram.KANJI_1_0"),
        Messages.getString("NGram.KANJI_1_2"),
        Messages.getString("NGram.KANJI_1_4"),
        Messages.getString("NGram.KANJI_1_8"),
        Messages.getString("NGram.KANJI_1_11"),
        Messages.getString("NGram.KANJI_1_12"),
        Messages.getString("NGram.KANJI_1_13"),
        Messages.getString("NGram.KANJI_1_14"),
        Messages.getString("NGram.KANJI_1_16"),
        Messages.getString("NGram.KANJI_1_18"),
        Messages.getString("NGram.KANJI_1_22"),
        Messages.getString("NGram.KANJI_1_27"),
        Messages.getString("NGram.KANJI_1_29"),
        Messages.getString("NGram.KANJI_1_31"),
        Messages.getString("NGram.KANJI_1_35"),
        Messages.getString("NGram.KANJI_2_0"),
        Messages.getString("NGram.KANJI_2_1"),
        Messages.getString("NGram.KANJI_2_4"),
        Messages.getString("NGram.KANJI_2_9"),
        Messages.getString("NGram.KANJI_2_10"),
        Messages.getString("NGram.KANJI_2_11"),
        Messages.getString("NGram.KANJI_2_12"),
        Messages.getString("NGram.KANJI_2_13"),
        Messages.getString("NGram.KANJI_2_15"),
        Messages.getString("NGram.KANJI_2_16"),
        Messages.getString("NGram.KANJI_2_18"),
        Messages.getString("NGram.KANJI_2_21"),
        Messages.getString("NGram.KANJI_2_22"),
        Messages.getString("NGram.KANJI_2_23"),
        Messages.getString("NGram.KANJI_2_28"),
        Messages.getString("NGram.KANJI_2_29"),
        Messages.getString("NGram.KANJI_2_30"),
        Messages.getString("NGram.KANJI_2_31"),
        Messages.getString("NGram.KANJI_2_32"),
        Messages.getString("NGram.KANJI_2_35"),
        Messages.getString("NGram.KANJI_2_36"),
        Messages.getString("NGram.KANJI_2_37"),
        Messages.getString("NGram.KANJI_2_38"),
        Messages.getString("NGram.KANJI_3_1"),
        Messages.getString("NGram.KANJI_3_2"),
        Messages.getString("NGram.KANJI_3_3"),
        Messages.getString("NGram.KANJI_3_4"),
        Messages.getString("NGram.KANJI_3_5"),
        Messages.getString("NGram.KANJI_3_8"),
        Messages.getString("NGram.KANJI_3_9"),
        Messages.getString("NGram.KANJI_3_11"),
        Messages.getString("NGram.KANJI_3_12"),
        Messages.getString("NGram.KANJI_3_13"),
        Messages.getString("NGram.KANJI_3_15"),
        Messages.getString("NGram.KANJI_3_16"),
        Messages.getString("NGram.KANJI_3_18"),
        Messages.getString("NGram.KANJI_3_19"),
        Messages.getString("NGram.KANJI_3_22"),
        Messages.getString("NGram.KANJI_3_23"),
        Messages.getString("NGram.KANJI_3_27"),
        Messages.getString("NGram.KANJI_3_29"),
        Messages.getString("NGram.KANJI_3_30"),
        Messages.getString("NGram.KANJI_3_31"),
        Messages.getString("NGram.KANJI_3_32"),
        Messages.getString("NGram.KANJI_3_35"),
        Messages.getString("NGram.KANJI_3_36"),
        Messages.getString("NGram.KANJI_3_37"),
        Messages.getString("NGram.KANJI_3_38"),
        Messages.getString("NGram.KANJI_4_0"),
        Messages.getString("NGram.KANJI_4_9"),
        Messages.getString("NGram.KANJI_4_10"),
        Messages.getString("NGram.KANJI_4_16"),
        Messages.getString("NGram.KANJI_4_17"),
        Messages.getString("NGram.KANJI_4_18"),
        Messages.getString("NGram.KANJI_4_22"),
        Messages.getString("NGram.KANJI_4_24"),
        Messages.getString("NGram.KANJI_4_28"),
        Messages.getString("NGram.KANJI_4_34"),
        Messages.getString("NGram.KANJI_4_39"),
        Messages.getString("NGram.KANJI_5_10"),
        Messages.getString("NGram.KANJI_5_11"),
        Messages.getString("NGram.KANJI_5_12"),
        Messages.getString("NGram.KANJI_5_13"),
        Messages.getString("NGram.KANJI_5_14"),
        Messages.getString("NGram.KANJI_5_18"),
        Messages.getString("NGram.KANJI_5_26"),
        Messages.getString("NGram.KANJI_5_29"),
        Messages.getString("NGram.KANJI_5_34"),
        Messages.getString("NGram.KANJI_5_39"),
        Messages.getString("NGram.KANJI_6_0"),
        Messages.getString("NGram.KANJI_6_3"),
        Messages.getString("NGram.KANJI_6_9"),
        Messages.getString("NGram.KANJI_6_10"),
        Messages.getString("NGram.KANJI_6_11"),
        Messages.getString("NGram.KANJI_6_12"),
        Messages.getString("NGram.KANJI_6_16"),
        Messages.getString("NGram.KANJI_6_18"),
        Messages.getString("NGram.KANJI_6_20"),
        Messages.getString("NGram.KANJI_6_21"),
        Messages.getString("NGram.KANJI_6_22"),
        Messages.getString("NGram.KANJI_6_23"),
        Messages.getString("NGram.KANJI_6_25"),
        Messages.getString("NGram.KANJI_6_28"),
        Messages.getString("NGram.KANJI_6_29"),
        Messages.getString("NGram.KANJI_6_30"),
        Messages.getString("NGram.KANJI_6_32"),
        Messages.getString("NGram.KANJI_6_34"),
        Messages.getString("NGram.KANJI_6_35"),
        Messages.getString("NGram.KANJI_6_37"),
        Messages.getString("NGram.KANJI_6_39"),
        Messages.getString("NGram.KANJI_7_0"),
        Messages.getString("NGram.KANJI_7_3"),
        Messages.getString("NGram.KANJI_7_6"),
        Messages.getString("NGram.KANJI_7_7"),
        Messages.getString("NGram.KANJI_7_9"),
        Messages.getString("NGram.KANJI_7_11"),
        Messages.getString("NGram.KANJI_7_12"),
        Messages.getString("NGram.KANJI_7_13"),
        Messages.getString("NGram.KANJI_7_16"),
        Messages.getString("NGram.KANJI_7_18"),
        Messages.getString("NGram.KANJI_7_19"),
        Messages.getString("NGram.KANJI_7_20"),
        Messages.getString("NGram.KANJI_7_21"),
        Messages.getString("NGram.KANJI_7_23"),
        Messages.getString("NGram.KANJI_7_25"),
        Messages.getString("NGram.KANJI_7_28"),
        Messages.getString("NGram.KANJI_7_29"),
        Messages.getString("NGram.KANJI_7_32"),
        Messages.getString("NGram.KANJI_7_33"),
        Messages.getString("NGram.KANJI_7_35"),
        Messages.getString("NGram.KANJI_7_37"),
    };

    // Build cjk_map: every character of a group points at the group's first character.
    // Must stay below CJK_CLASS so the table is initialized before it is read.
    static {
        cjk_map = new HashMap<Character, Character>();
        for (final String group : CJK_CLASS) {
            final char representative = group.charAt(0);
            for (final char member : group.toCharArray()) {
                cjk_map.put(member, representative);
            }
        }
    }
}