//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.stats;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableSet;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObject;
import com.optimaize.langdetect.text.TextObjectFactory;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
/**
* Sets the document language using Language Detector library.
*
* <p>Uses the Language Detector library to identify the language of the document from a random sample of N-grams.
* If the language can't be detected, then <i>x-unspecified</i> is returned.</p>
*
*
*/
public class DocumentLanguage extends BaleenAnnotator {
private LanguageDetector languageDetector;
private TextObjectFactory textObjectFactory;
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
try{
List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
.withProfiles(languageProfiles)
.build();
textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
}catch(IOException ioe){
throw new ResourceInitializationException(ioe);
}
}
@Override
public void doProcess(JCas aJCas) throws AnalysisEngineProcessException {
TextObject textObject = textObjectFactory.forText(aJCas.getDocumentText());
Optional<LdLocale> lang = languageDetector.detect(textObject);
if(lang.isPresent()){
aJCas.setDocumentLanguage(lang.get().getLanguage());
}
}
@Override
public void doDestroy(){
textObjectFactory = null;
languageDetector = null;
}
@Override
public AnalysisEngineAction getAction() {
return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(DocumentAnnotation.class));
}
}