/*
 * Copyright 2014
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.langdetect;

import java.io.File;
import java.io.IOException;
import java.net.URL;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;

import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;

/**
 * Langdetect language identifier based on character n-grams.
 * <p>
 * For every processed CAS the document language is set to the value reported by the langdetect
 * {@link Detector}, or to {@code "x-unspecified"} when the text offers nothing to detect a
 * language from.
 */
public class LangDetectLanguageIdentifier extends JCasAnnotator_ImplBase {

    /**
     * Variant of the model. Used to address a specific model if there are multiple models for
     * one language.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Location from which the model is read.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    /** Language code stored in the CAS when no language could be detected. */
    private static final String UNKNOWN_LANGUAGE = "x-unspecified";

    /** Message of the {@link LangDetectException} that is tolerated instead of rethrown. */
    private static final String NO_FEATURES_MESSAGE = "no features in text";

    /** Resolves the profile folder and loads it into the {@link DetectorFactory}. */
    private CasConfigurableProviderBase<File> modelProvider;

    /**
     * Sets up the model provider that locates the language profiles and loads them into the
     * {@link DetectorFactory}.
     *
     * @param context
     *            the UIMA context, passed on to the superclass.
     * @throws ResourceInitializationException
     *             if the superclass initialization fails.
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);

        modelProvider = new ModelProviderBase<File>() {
            {
                setContextObject(LangDetectLanguageIdentifier.this);

                setDefault(ARTIFACT_ID, "${groupId}.langdetect-model-${language}-${variant}");
                setDefault(LOCATION,
                        "classpath:/${package}/lib/languageidentifier-${language}-${variant}.properties");
                setDefault(VARIANT, "wikipedia");

                setOverride(LOCATION, modelLocation);
                // The language is always overridden with "any", so the model lookup does not
                // depend on a language already set in the CAS.
                setOverride(LANGUAGE, "any");
                setOverride(VARIANT, variant);
            }

            /**
             * Replaces the profiles held by the {@link DetectorFactory} with the ones found at
             * the given location.
             *
             * @param aUrl
             *            location of the language profiles.
             * @return the folder the profiles were loaded from.
             * @throws IOException
             *             if the folder cannot be obtained or langdetect rejects the profiles.
             */
            @Override
            protected File produceResource(URL aUrl) throws IOException {
                try {
                    // DetectorFactory is only reached through static calls, so its profiles
                    // are discarded first to avoid mixing them with previously loaded ones.
                    // NOTE(review): static state is shared by all instances of this component
                    // in the JVM - confirm this is acceptable for concurrent pipelines.
                    DetectorFactory.clear();
                    // NOTE(review): the second argument is presumably a caching flag - confirm
                    // against ResourceUtils.getClasspathAsFolder.
                    File profileFolder = ResourceUtils.getClasspathAsFolder(aUrl.toString(),
                            false);
                    DetectorFactory.loadProfile(profileFolder);
                    return profileFolder;
                } catch (LangDetectException e) {
                    throw new IOException(e);
                }
            }
        };
    }

    /**
     * Detects the language of the document text and stores it as the document language.
     *
     * @param aJCas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             if language detection fails for a reason other than missing features.
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        modelProvider.configure(aJCas.getCas());
        // The returned folder is not needed here; the call is made so that the provider gets
        // the chance to run produceResource(), which loads the profiles into DetectorFactory.
        modelProvider.getResource();

        String documentText = aJCas.getDocumentText();
        String language = detectLanguage(documentText);
        aJCas.setDocumentLanguage(language);
    }

    /**
     * Runs langdetect on the given text.
     *
     * @param aDocumentText
     *            the text to analyze; may be {@code null}.
     * @return the detected language code, or {@code "x-unspecified"} if the text is
     *         {@code null} or langdetect reports that it contains no features.
     * @throws AnalysisEngineProcessException
     *             if langdetect fails with any error other than "no features in text".
     */
    private String detectLanguage(String aDocumentText) throws AnalysisEngineProcessException {
        // Without a guard a missing document text would surface as a bare
        // NullPointerException; treat it like text without usable features instead.
        if (aDocumentText == null) {
            return UNKNOWN_LANGUAGE;
        }

        try {
            Detector detector = DetectorFactory.create();
            detector.append(aDocumentText);
            return detector.detect();
        } catch (LangDetectException e) {
            // "no features in text" might occur if a message consists, for instance, of a
            // single numeric value. We silently ignore this particular error, but rethrow
            // all others.
            if (!isFeatureException(e)) {
                throw new AnalysisEngineProcessException(e);
            }
            return UNKNOWN_LANGUAGE;
        }
    }

    /**
     * Tells whether the exception is the tolerated "no features in text" error.
     *
     * @param e
     *            the exception raised by langdetect.
     * @return {@code true} if the exception message equals "no features in text".
     */
    private boolean isFeatureException(LangDetectException e) {
        // Constant-first comparison: an exception without a message must not trigger a
        // NullPointerException that would hide the original failure.
        return NO_FEATURES_MESSAGE.equals(e.getMessage());
    }
}