/*
* Copyright 2007 T-Rank AS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package no.trank.openpipe.lang.step;
import java.nio.charset.Charset;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import no.trank.openpipe.api.BasePipelineStep;
import no.trank.openpipe.api.PipelineException;
import no.trank.openpipe.api.PipelineStepStatus;
import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.config.annotation.NotEmpty;
import no.trank.openpipe.config.annotation.NullNotEmpty;
/**
* A pipeline step for identifying langauge of a field.
* <br/>
* Detected languages are: <tt>da</tt>, <tt>de</tt>, <tt>en</tt>, <tt>es</tt>, <tt>fr</tt>, <tt>it</tt>, <tt>nl</tt>,
* <tt>no</tt>, <tt>pt</tt> and <tt>sv</tt>.
* <br/>
* If a language cannot be detected, language is set to {@link #getDefaultLang() defaultLang}.
* <br/>
* Implementation uses ICU4J's {@link CharsetDetector} for detecting languages.
*
* @version $Revision$
*/
public class LanguageIdentifier extends BasePipelineStep {
private static final Logger log = LoggerFactory.getLogger(LanguageIdentifier.class);
private final CharsetDetector detector = new CharsetDetector();
private final Charset charset = Charset.forName("ISO-8859-1");
@NotEmpty
private String inputField;
@NotEmpty
private String langField = "language";
@NullNotEmpty
private String defaultLang;
private boolean overwrite;
private int minConfidence = 50;
/**
* Creates a step with <tt>"LanguageIdentifier"</tt> as name.
*/
public LanguageIdentifier() {
detector.setDeclaredEncoding(charset.name());
}
@Override
public PipelineStepStatus execute(Document doc) throws PipelineException {
final String text = doc.getFieldValue(inputField);
if (!overwrite && doc.getFieldValue(langField) != null) {
log.debug("Field '{}' already set to '{}'", langField, doc.getFieldValue(langField));
} else {
if (text != null) {
detector.setText(text.getBytes(charset));
final CharsetMatch match = detector.detect();
final int confidence = match.getConfidence();
if (confidence < minConfidence) {
log.debug("Confidence {} below minConfidence {}", confidence, minConfidence);
doc.setFieldValue(langField, defaultLang);
} else {
final String lang = match.getLanguage();
if (lang == null) {
log.debug("Confidence: {} but no language detected", confidence);
doc.setFieldValue(langField, defaultLang);
} else {
log.debug("Confidence: {} detected language {}", confidence, lang);
doc.setFieldValue(langField, lang);
}
}
}
}
return PipelineStepStatus.DEFAULT;
}
public String getInputField() {
return inputField;
}
public void setInputField(String inputField) {
this.inputField = inputField;
}
/**
* Gets the field name to put the detected language.
*
* @return the field name to put the detected language.
*/
public String getLangField() {
return langField;
}
/**
* Sets the field name to put the detected language.
*
* @param langField the field name to put the detected language.
*/
public void setLangField(String langField) {
this.langField = langField;
}
/**
* Gets the default language, default value is <tt>null</tt>.
*
* @return the default language.
*/
public String getDefaultLang() {
return defaultLang;
}
/**
* Sets the default language.
*
* @param defaultLang the new default language.
*/
public void setDefaultLang(String defaultLang) {
this.defaultLang = defaultLang;
}
/**
* Gets whether to overwrite field with name {@link #getLangField() langField} or not.
*
* @return <tt>true</tt> if field with name {@link #getLangField() langField} can be overwritten.
*/
public boolean isOverwrite() {
return overwrite;
}
/**
* Sets whether to overwrite field with name {@link #getLangField() langField} or not.
*
* @param overwrite <tt>true</tt> to overwrite.
*/
public void setOverwrite(boolean overwrite) {
this.overwrite = overwrite;
}
/**
* Gets the minimum confidence needed for a detected language. If confidence is <tt>< minConfidence</tt>
* {@link #getDefaultLang() defaultLang} is used.
*
* @return the minimum confidence needed for a detected language.
*/
public int getMinConfidence() {
return minConfidence;
}
/**
* Sets the minimum confidence needed for a detected language.
*
* @param minConfidence the minimum confidence needed for a detected language.
*/
public void setMinConfidence(int minConfidence) {
this.minConfidence = minConfidence;
}
@Override
public String getRevision() {
return "$Revision$";
}
@Override
public String toString() {
return "LanguageIdentifier{" +
"inputField='" + inputField + '\'' +
", langField='" + langField + '\'' +
", defaultLang='" + defaultLang + '\'' +
", overwrite=" + overwrite +
", minConfidence=" + minConfidence +
'}';
}
}