package com.cybozu.labs.langdetect.util; import org.json.JSONArray; import org.json.JSONObject; import java.util.HashMap; import java.util.Iterator; import java.util.Set; /** * {@link LangProfile} is a Language Profile Class. * Users don't use this class directly. * * @author Nakatani Shuyo */ public class LangProfile { private static final int MINIMUM_FREQ = 2; private static final int LESS_FREQ_RATIO = 100000; public String name = null; public HashMap<String, Integer> freq = new HashMap<String, Integer>(); public int[] n_words = new int[NGram.N_GRAM]; /** * Constructor for JSONIC */ public LangProfile() {} /** * Normal Constructor * @param name language name */ public LangProfile(String name) { this.name = name; } public LangProfile(JSONObject json) { this.name = json.getString("name"); this.freq = toFreq(json.getJSONObject("freq")); this.n_words = toNWords(json.getJSONArray("n_words")); } private int[] toNWords(JSONArray json) { int[] result = new int[json.length()]; for(int i=0; i<json.length(); i++){ result[i] = json.getInt(i); } return result; } private HashMap<String, Integer> toFreq(JSONObject freq) { HashMap<String, Integer> result = new HashMap<String, Integer>(); Iterator keys = freq.keys(); while(keys.hasNext()) { String key = (String) keys.next(); result.put(key, freq.getInt(key)); } return result; } /** * Add n-gram to profile * @param gram */ public void add(String gram) { if (name == null || gram == null) return; // Illegal int len = gram.length(); if (len < 1 || len > NGram.N_GRAM) return; // Illegal ++n_words[len - 1]; if (freq.containsKey(gram)) { freq.put(gram, freq.get(gram) + 1); } else { freq.put(gram, 1); } } /** * Eliminate below less frequency n-grams and noise Latin alphabets */ public void omitLessFreq() { if (name == null) return; // Illegal int threshold = n_words[0] / LESS_FREQ_RATIO; if (threshold < MINIMUM_FREQ) threshold = MINIMUM_FREQ; Set<String> keys = freq.keySet(); int roman = 0; for(Iterator<String> i = keys.iterator(); i.hasNext(); ){ String key = i.next(); int count = freq.get(key); if (count <= threshold) { n_words[key.length()-1] -= count; i.remove(); } else { if (key.matches("^[A-Za-z]$")) { roman += count; } } } // roman check if (roman < n_words[0] / 3) { Set<String> keys2 = freq.keySet(); for(Iterator<String> i = keys2.iterator(); i.hasNext(); ){ String key = i.next(); if (key.matches(".*[A-Za-z].*")) { n_words[key.length()-1] -= freq.get(key); i.remove(); } } } } /** * Update the language profile with (fragmented) text. * Extract n-grams from text and add their frequency into the profile. * @param text (fragmented) text to extract n-grams */ public void update(String text) { if (text == null) return; text = NGram.normalize_vi(text); NGram gram = new NGram(); for(int i=0; i<text.length(); ++i) { gram.addChar(text.charAt(i)); for(int n=1; n<=NGram.N_GRAM; ++n) { add(gram.get(n)); } } } }