package org.xbib.elasticsearch.common.langdetect;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.settings.Settings;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.regex.Pattern;

/**
 * Language detection service. Loads n-gram language profiles from the
 * classpath and detects the most probable languages of a given text.
 */
public class LangdetectService {

    private static final String[] DEFAULT_LANGUAGES = new String[]{
            // "af",
            "ar",
            "bg",
            "bn",
            "cs",
            "da",
            "de",
            "el",
            "en",
            "es",
            "et",
            "fa",
            "fi",
            "fr",
            "gu",
            "he",
            "hi",
            "hr",
            "hu",
            "id",
            "it",
            "ja",
            // "kn",
            "ko",
            "lt",
            "lv",
            "mk",
            "ml",
            // "mr",
            // "ne",
            "nl",
            "no",
            "pa",
            "pl",
            "pt",
            "ro",
            "ru",
            // "sk",
            // "sl",
            // "so",
            "sq",
            "sv",
            // "sw",
            "ta",
            "te",
            "th",
            "tl",
            "tr",
            "uk",
            "ur",
            "vi",
            "zh-cn",
            "zh-tw"
    };

    private static final Logger logger = LogManager.getLogger(LangdetectService.class.getName());

    private static final Pattern word = Pattern.compile("[\\P{IsWord}]", Pattern.UNICODE_CHARACTER_CLASS);

    private static final Settings DEFAULT_SETTINGS = Settings.builder()
            .putArray("languages", DEFAULT_LANGUAGES)
            .build();

    private final Settings settings;

    private Map<String, double[]> wordLangProbMap = new HashMap<>();

    private List<String> langlist = new LinkedList<>();

    private Map<String, String> langmap = new HashMap<>();

    private String profile;

    private double alpha;

    private double alphaWidth;

    private int nTrial;

    private double[] priorMap;

    private int iterationLimit;

    private double probThreshold;

    private double convThreshold;

    private int baseFreq;

    private Pattern filterPattern;

    private boolean isStarted;

    public LangdetectService() {
        this(DEFAULT_SETTINGS);
    }

    public LangdetectService(Settings settings) {
        this(settings, null);
    }

    public LangdetectService(Settings settings, String profile) {
        this.settings = settings;
        this.profile = settings.get("profile", profile);
        load(settings);
        init();
    }

    public Settings getSettings() {
        return settings;
    }

    private void load(Settings settings) {
        if (settings.equals(Settings.EMPTY)) {
            return;
        }
        try {
            String[] keys = settings.getAsArray("languages");
            if (keys.length == 0) {
                keys = DEFAULT_LANGUAGES;
            }
            int index = 0;
            int size = keys.length;
            for (String key : keys) {
                if (key != null && !key.isEmpty()) {
                    loadProfileFromResource(key, index++, size);
                }
            }
            logger.debug("language detection service installed for {}", langlist);
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
            throw new ElasticsearchException(e.getMessage() + " profile=" + profile);
        }
        try {
            // language code mappings given directly in the settings
            Settings map = Settings.builder().put(settings.getByPrefix("map.")).build();
            if (map.getAsMap().isEmpty()) {
                // otherwise, interpret "map" as a resource name
                String s = settings.get("map") != null ?
settings.get("map") : this.profile + "language.json"; InputStream in = getClass().getResourceAsStream(s); if (in != null) { map = Settings.builder().loadFromStream(s, in).build(); } } this.langmap = map.getAsMap(); } catch (Exception e) { logger.error(e.getMessage(), e); throw new ElasticsearchException(e.getMessage()); } } private void init() { this.priorMap = null; this.nTrial = settings.getAsInt("number_of_trials", 7); this.alpha = settings.getAsDouble("alpha", 0.5); this.alphaWidth = settings.getAsDouble("alpha_width", 0.05); this.iterationLimit = settings.getAsInt("iteration_limit", 10000); this.probThreshold = settings.getAsDouble("prob_threshold", 0.1); this.convThreshold = settings.getAsDouble("conv_threshold", 0.99999); this.baseFreq = settings.getAsInt("base_freq", 10000); this.filterPattern = settings.get("pattern") != null ? Pattern.compile(settings.get("pattern"), Pattern.UNICODE_CHARACTER_CLASS) : null; isStarted = true; } public void loadProfileFromResource(String resource, int index, int langsize) throws IOException { String thisProfile = "/langdetect/" + (this.profile != null ? this.profile + "/" : ""); InputStream in = getClass().getResourceAsStream(thisProfile + resource); if (in == null) { throw new IOException("profile '" + resource + "' not found"); } LangProfile langProfile = new LangProfile(); langProfile.read(in); addProfile(langProfile, index, langsize); } public void addProfile(LangProfile profile, int index, int langsize) throws IOException { String lang = profile.getName(); if (langlist.contains(lang)) { throw new IOException("duplicate of the same language profile: " + lang); } langlist.add(lang); for (String s : profile.getFreq().keySet()) { if (!wordLangProbMap.containsKey(s)) { wordLangProbMap.put(s, new double[langsize]); } int length = s.length(); if (length >= 1 && length <= 3) { double prob = profile.getFreq().get(s).doubleValue() / profile.getNWords().get(length - 1); wordLangProbMap.get(s)[index] = prob; } } } public String getProfile() { return profile; } public List<Language> detectAll(String text) throws LanguageDetectionException { if (!isStarted) { load(settings); init(); } List<Language> languages = new ArrayList<>(); if (filterPattern != null && !filterPattern.matcher(text).matches()) { return languages; } List<String> list = new ArrayList<>(); languages = sortProbability(languages, detectBlock(list, text)); return languages.subList(0, Math.min(languages.size(), settings.getAsInt("max", languages.size()))); } private double[] detectBlock(List<String> list, String string) throws LanguageDetectionException { // clean all non-work characters from text String text = string.replaceAll(word.pattern(), " "); extractNGrams(list, text); double[] langprob = new double[langlist.size()]; if (list.isEmpty()) { return langprob; } Random rand = new Random(); Long seed = 0L; rand.setSeed(seed); for (int t = 0; t < nTrial; ++t) { double[] prob = initProbability(); double a = this.alpha + rand.nextGaussian() * alphaWidth; for (int i = 0; ; ++i) { int r = rand.nextInt(list.size()); updateLangProb(prob, list.get(r), a); if (i % 5 == 0 && normalizeProb(prob) > convThreshold || i >= iterationLimit) { break; } } for (int j = 0; j < langprob.length; ++j) { langprob[j] += prob[j] / nTrial; } } return langprob; } private double[] initProbability() { double[] prob = new double[langlist.size()]; if (priorMap != null) { System.arraycopy(priorMap, 0, prob, 0, prob.length); } else { for (int i = 0; i < prob.length; ++i) { prob[i] = 1.0 / langlist.size(); } } return prob; } 
    private void extractNGrams(List<String> list, String text) {
        NGram ngram = new NGram();
        for (int i = 0; i < text.length(); ++i) {
            ngram.addChar(text.charAt(i));
            // collect every known n-gram of length 1..N_GRAM ending at this character
            for (int n = 1; n <= NGram.N_GRAM; ++n) {
                String w = ngram.get(n);
                if (w != null && wordLangProbMap.containsKey(w)) {
                    list.add(w);
                }
            }
        }
    }

    private boolean updateLangProb(double[] prob, String word, double alpha) {
        if (word == null || !wordLangProbMap.containsKey(word)) {
            return false;
        }
        double[] langProbMap = wordLangProbMap.get(word);
        double weight = alpha / baseFreq;
        for (int i = 0; i < prob.length; ++i) {
            prob[i] *= weight + langProbMap[i];
        }
        return true;
    }

    private double normalizeProb(double[] prob) {
        if (prob.length == 0) {
            return 0d;
        }
        double sump = prob[0];
        for (int i = 1; i < prob.length; i++) {
            sump += prob[i];
        }
        double maxp = 0d;
        for (int i = 0; i < prob.length; i++) {
            double p = prob[i] / sump;
            if (maxp < p) {
                maxp = p;
            }
            prob[i] = p;
        }
        return maxp;
    }

    private List<Language> sortProbability(List<Language> list, double[] prob) {
        for (int j = 0; j < prob.length; ++j) {
            double p = prob[j];
            if (p > probThreshold) {
                // insert in descending order of probability
                for (int i = 0; i <= list.size(); ++i) {
                    if (i == list.size() || list.get(i).getProbability() < p) {
                        String code = langlist.get(j);
                        if (langmap != null && langmap.containsKey(code)) {
                            code = langmap.get(code);
                        }
                        list.add(i, new Language(code, p));
                        break;
                    }
                }
            }
        }
        return list;
    }
}
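
// Usage sketch (hypothetical caller, not part of this class): with the bundled
// language profiles on the classpath, detection with the default settings could
// look like the following. Only getProbability() is confirmed by this file;
// Language's toString() output is an assumption, and a real caller would also
// handle the declared LanguageDetectionException.
//
//   LangdetectService service = new LangdetectService();
//   List<Language> candidates = service.detectAll("Das ist ein kurzer deutscher Satz.");
//   for (Language language : candidates) {
//       // candidates are sorted by descending probability
//       System.out.println(language + " " + language.getProbability());
//   }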