/*
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.textcat;

import java.util.HashMap;
import java.util.Map;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.jcas.JCas;
import org.knallgrau.utils.textcat.TextCategorizer;

/**
 * <p>Detection based on character n-grams. Uses the <a href="http://textcat.sourceforge.net">Java
 * Text Categorizing Library</a> based on a technique by Cavnar and Trenkle.</p>
 *
 * <p>The category name returned by the categorizer is translated into a two-letter ISO language
 * code and stored as the document language of the CAS. Category names that have no entry in the
 * translation table result in the document language {@code "x-unspecified"} instead of
 * {@code null}.</p>
 *
 * <p><b>References</b></p>
 * <ul>
 * <li>Cavnar, W. B. and J. M. Trenkle (1994). N-Gram-Based Text Categorization.
 * In Proceedings of Third Annual Symposium on Document Analysis and Information Retrieval,
 * Las Vegas, NV, UNLV Publications/Reprographics, pp. 161-175, 11-13 April 1994.</li>
 * </ul>
 */
public class LanguageIdentifier
    extends JCasAnnotator_ImplBase
{
    /**
     * Language code stored when the categorizer result cannot be translated.
     * NOTE(review): intended to match UIMA's default language name
     * ({@code CAS.DEFAULT_LANGUAGE_NAME}) - confirm against the UIMA version in use.
     */
    private static final String UNKNOWN_LANGUAGE = "x-unspecified";

    /** Translates the category names produced by the categorizer into ISO language codes. */
    private static final Map<String, String> langName2ISO = new HashMap<String, String>();
    static {
        langName2ISO.put("german", "de");
        langName2ISO.put("english", "en");
        langName2ISO.put("french", "fr");
        langName2ISO.put("spanish", "es");
        langName2ISO.put("italian", "it");
        langName2ISO.put("swedish", "sv");
        langName2ISO.put("polish", "pl");
        langName2ISO.put("dutch", "nl");
        langName2ISO.put("norwegian", "no");
        langName2ISO.put("finnish", "fi");
        langName2ISO.put("albanian", "sq");
        langName2ISO.put("slovakian", "sk");
        langName2ISO.put("slovenian", "sl");
        langName2ISO.put("danish", "da");
        langName2ISO.put("hungarian", "hu");
    }

    /** Categorizer instance created once per annotator and reused for every document. */
    private final TextCategorizer categorizer = new TextCategorizer();

    /**
     * Categorizes the document text and records the detected language on the CAS.
     *
     * <p>If the CAS has no document text, the CAS is left untouched. Otherwise the document
     * language is always overwritten: with the mapped ISO code when the category is known, or
     * with {@code "x-unspecified"} when it is not.</p>
     *
     * @param aJCas
     *            the CAS whose document text is analyzed and whose document language is set.
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method; not thrown directly by this implementation.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        String docText = aJCas.getDocumentText();
        if (docText == null) {
            // Nothing to analyze; keep whatever language is already set.
            return;
        }

        String category = categorizer.categorize(docText);
        String isoCode = langName2ISO.get(category);

        // The table covers only a fixed set of category names. Never write null into the
        // document language, because later components may rely on a non-null value.
        aJCas.setDocumentLanguage(isoCode != null ? isoCode : UNKNOWN_LANGUAGE);
    }
}