/*
* Copyright 2015 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.xd.analytics.linguistics.langdetect;
import static org.springframework.util.StringUtils.*;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Field;
import java.util.*;
import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
import org.springframework.util.CollectionUtils;
import org.springframework.util.FileCopyUtils;
import org.springframework.util.ReflectionUtils;
import org.springframework.xd.tuple.Tuple;
import org.springframework.xd.tuple.TupleBuilder;
/**
* A processor that can predict the language of a piece of text extracted from a {@link org.springframework.xd.tuple.Tuple}.
* <p>
* Language prediction is supported for short and long texts via different language models.
* Note since the {@code langdetect} library holds the language model as static state one
* can only deal with one model instance per {@link java.lang.ClassLoader} at a time.
* </p>
* The langdetect library has the following characteristics.
* <ul>
* <li>Generate language profiles from Wikipedia abstract xml</li>
* <li>Detect language of a text using naive Bayesian filter</li>
* <li>99% over precision for 53 languages</li>
* </ul>
* More details can be found the the <a href="https://code.google.com/p/language-detection/wiki/FrequentlyAskedQuestion">langdetect FAQ</a>.
*
* @author Thomas Darimont
*/
public class LanguageDetector implements InitializingBean {
private static final Logger LOG = LoggerFactory.getLogger(LanguageDetector.class);
private String languageProfileLocation;
private TextModel textModel;
private String inputTextContentPropertyName;
private String mostLikelyLanguageOutputPropertyName;
private String languageProbabilitiesOutputPropertyName;
private boolean returnMostLikelyLanguage;
private boolean returnLanguageProbabilities;
private boolean deterministicLanguageDetection;
private String languagePriorities;
//we need to use concrete types here since Detector requires them :-(
private HashMap<String,Double> languagePriorityMap;
private DetectorFactoryState detectorFactoryState;
private LanguagePriorityParser languagePriorityParser = new LanguagePriorityParser();
/**
* Performs the language prediction based on text extracted from the given {@link org.springframework.xd.tuple.Tuple}.
* <p>
* The text used for the language prediction is extracted via the {@link LanguageDetector#inputTextContentPropertyName}
* from the given {@code Tuple}.
* </p>
*
* @param input the {@code Tuple} to extract the text from
* @return a new {@code Tuple} with the predicted language information.
* @throws LangDetectException
*/
public Tuple process(Tuple input) throws LangDetectException {
if (!isLanguageDetectionEnabled()) {
return input;
}
String text = input.getString(getInputTextContentPropertyName());
Detector detector = newDetector(this.detectorFactoryState);
detector.append(text);
List<String> names = new ArrayList<String>();
List<Object> values = new ArrayList<Object>();
names.addAll(input.getFieldNames());
values.addAll(input.getValues());
if (isReturnMostLikelyLanguage()) {
names.add(getMostLikelyLanguageOutputPropertyName());
values.add(detector.detect());
}
if (isReturnLanguageProbabilities()) {
names.add(getLanguageProbabilitiesOutputPropertyName());
values.add(detector.getProbabilities());
}
return TupleBuilder.tuple().ofNamesAndValues(names, values);
}
/**
* This creates a new {@link com.cybozu.labs.langdetect.Detector} instance.
* We have to create the instance ourselves to avoid problems with the shared state inside {@link com.cybozu.labs.langdetect.DetectorFactory}.
*
* @param detectorFactoryState
* @return
*/
private Detector newDetector(DetectorFactoryState detectorFactoryState) throws LangDetectException {
Detector detector = new Detector(detectorFactoryState.getWordLangProbMap(), detectorFactoryState.getLanguageList(), detectorFactoryState.getSeed());
if (!CollectionUtils.isEmpty(languagePriorityMap)) {
detector.setPriorMap(languagePriorityMap);
}
return detector;
}
private boolean isLanguageDetectionEnabled() {
return isReturnMostLikelyLanguage() || isReturnLanguageProbabilities();
}
@Override
public void afterPropertiesSet() throws Exception {
this.languagePriorityMap = new HashMap<String,Double>(languagePriorityParser.parseToLanguagePriorityMap(languagePriorities));
loadLanguageProfiles();
this.detectorFactoryState = captureDetectorFactoryState();
LOG.info("Loaded language profiles from {}.", languageProfileLocation);
}
private void loadLanguageProfiles() throws LangDetectException, IOException {
if (isEmpty(this.languageProfileLocation)) {
LOG.info("Using embedded language profiles.");
loadEmbeddedLangaugeProfiles();
return;
}
LOG.info("Using language profiles from {}.", languageProfileLocation);
loadExternalLanguageProfiles();
}
private void loadExternalLanguageProfiles() throws LangDetectException, IOException {
Resource languageProfileResource = new DefaultResourceLoader(getClass().getClassLoader()).getResource(languageProfileLocation);
DetectorFactory.loadProfile(languageProfileResource.getFile());
}
private void loadEmbeddedLangaugeProfiles() throws LangDetectException {
DetectorFactory.loadProfile(extractEmbeddedLanguageModels());
}
/**
* Captures the current state of the {@link com.cybozu.labs.langdetect.DetectorFactory} to make sure
* that state cannot be overridden by concurrent initializations later on.
* <p>This is necessary since, the state in the {@code DetectorFactory} is stored globally.</p>
*
* @return
*/
@SuppressWarnings("unchecked")
private DetectorFactoryState captureDetectorFactoryState() {
Field detectorFactoryInstanceField = ReflectionUtils.findField(DetectorFactory.class, "instance_");
ReflectionUtils.makeAccessible(detectorFactoryInstanceField);
Field detectorFactoryWordLangProbMapField = ReflectionUtils.findField(DetectorFactory.class, "wordLangProbMap");
ReflectionUtils.makeAccessible(detectorFactoryWordLangProbMapField);
DetectorFactory instance = (DetectorFactory) ReflectionUtils.getField(detectorFactoryInstanceField, null);
HashMap<String, double[]> wordLangProbMap = (HashMap<String, double[]>) ReflectionUtils.getField(detectorFactoryWordLangProbMapField, instance);
ArrayList<String> languageList = new ArrayList<>(DetectorFactory.getLangList());
return new DetectorFactoryState(languageList, wordLangProbMap, isDeterministicLanguageDetection() ? 0L : null);
}
private List<String> extractEmbeddedLanguageModels() {
List<String> languageModels = new ArrayList<String>();
Set<String> supportedLanguages = new TreeSet<String>();
for (Locale locale : Locale.getAvailableLocales()) {
if (locale.getLanguage().isEmpty()) {
continue;
}
supportedLanguages.add(locale.getLanguage().toLowerCase());
}
//added these manually since they were not present in the available Locales.
supportedLanguages.add("zh-cn");
supportedLanguages.add("zh-tw");
for (String lang : supportedLanguages) {
try (InputStream is = DetectorFactory.class.getClassLoader().getResourceAsStream("profiles/" + getTextModel().name().toLowerCase() + "/" + lang)) {
String json = FileCopyUtils.copyToString(new InputStreamReader(is));
languageModels.add(json);
} catch (Exception ex) {
continue;
}
}
return languageModels;
}
public String getLanguageProfileLocation() {
return languageProfileLocation;
}
public void setLanguageProfileLocation(String languageProfileLocation) {
this.languageProfileLocation = languageProfileLocation;
}
public TextModel getTextModel() {
return textModel;
}
public void setTextModel(TextModel textModel) {
this.textModel = textModel;
}
public String getInputTextContentPropertyName() {
return inputTextContentPropertyName;
}
public void setInputTextContentPropertyName(String inputTextContentPropertyName) {
this.inputTextContentPropertyName = inputTextContentPropertyName;
}
public String getMostLikelyLanguageOutputPropertyName() {
return mostLikelyLanguageOutputPropertyName;
}
public void setMostLikelyLanguageOutputPropertyName(String mostLikelyLanguageOutputPropertyName) {
this.mostLikelyLanguageOutputPropertyName = mostLikelyLanguageOutputPropertyName;
}
public String getLanguageProbabilitiesOutputPropertyName() {
return languageProbabilitiesOutputPropertyName;
}
public void setLanguageProbabilitiesOutputPropertyName(String languageProbabilitiesOutputPropertyName) {
this.languageProbabilitiesOutputPropertyName = languageProbabilitiesOutputPropertyName;
}
public boolean isReturnMostLikelyLanguage() {
return returnMostLikelyLanguage;
}
public void setReturnMostLikelyLanguage(boolean returnMostLikelyLanguage) {
this.returnMostLikelyLanguage = returnMostLikelyLanguage;
}
public boolean isReturnLanguageProbabilities() {
return returnLanguageProbabilities;
}
public void setReturnLanguageProbabilities(boolean returnLanguageProbabilities) {
this.returnLanguageProbabilities = returnLanguageProbabilities;
}
public boolean isDeterministicLanguageDetection() {
return deterministicLanguageDetection;
}
public void setDeterministicLanguageDetection(boolean deterministicLanguageDetection) {
this.deterministicLanguageDetection = deterministicLanguageDetection;
}
public String getLanguagePriorities() {
return languagePriorities;
}
public void setLanguagePriorities(String languagePriorities) {
this.languagePriorities = languagePriorities;
}
/**
* Holds the configured state of the {@link com.cybozu.labs.langdetect.DetectorFactory} at construction time to avoid
* sudden value changes in between. This is necessary since the DetectorFactory state is stored on a global singleton.
*/
static class DetectorFactoryState {
private final ArrayList<String> languageList;
private final HashMap<String, double[]> wordLangProbMap;
private final Long seed;
public DetectorFactoryState(ArrayList<String> languageList, HashMap<String, double[]> wordLangProbMap, Long seed) {
this.languageList = languageList;
this.wordLangProbMap = wordLangProbMap;
this.seed = seed;
}
public ArrayList<String> getLanguageList() {
return languageList;
}
public HashMap<String, double[]> getWordLangProbMap() {
return wordLangProbMap;
}
public Long getSeed() {
return seed;
}
}
}