/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.olat.core.commons.services.text.impl.nutch;

// JDK imports
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.olat.core.commons.services.text.impl.nutch.NGramProfile;
import org.olat.core.commons.services.text.impl.nutch.NGramProfile.NGramEntry;
import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;

/**
 * Identifies the language of a content, based on statistical n-gram analysis.<br>
 * SR: Changed to remove the dependencies on Nutch, Hadoop and commons-logging.
 *
 * @see <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
 *      Language Codes</a>
 *
 * @author Sami Siren
 * @author Jérôme Charron
 */
public class LanguageIdentifier {

	private OLog log = Tracing.createLoggerFor(this.getClass());

	/** 0 means the full content is analyzed */
	private final static int DEFAULT_ANALYSIS_LENGTH = 0;

	private List<NGramProfile> languages = new ArrayList<NGramProfile>();
	private List<String> supportedLanguages = new ArrayList<String>();

	/** Minimum size of NGrams */
	private int minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;

	/** Maximum size of NGrams */
	private int maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;

	/** The maximum amount of data to analyze */
	private int analyzeLength = DEFAULT_ANALYSIS_LENGTH;

	/** A global index of the ngrams of all supported languages */
	private Map<CharSequence, NGramEntry[]> ngramsIdx = new HashMap<CharSequence, NGramEntry[]>();

	/** The NGramProfile used for identification */
	private NGramProfile suspect = null;

	/**
	 * Constructs a new Language Identifier.
	 */
	public LanguageIdentifier() {
		// Use the default ngram sizes (formerly read from the Nutch configuration)
		minLength = NGramProfile.DEFAULT_MIN_NGRAM_LENGTH;
		maxLength = NGramProfile.DEFAULT_MAX_NGRAM_LENGTH;
		// Ensure the min and max values are in an acceptable range
		// (i.e. min >= ABSOLUTE_MIN_NGRAM_LENGTH and max <= ABSOLUTE_MAX_NGRAM_LENGTH)
		maxLength = Math.min(maxLength, NGramProfile.ABSOLUTE_MAX_NGRAM_LENGTH);
		maxLength = Math.max(maxLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
		minLength = Math.max(minLength, NGramProfile.ABSOLUTE_MIN_NGRAM_LENGTH);
		minLength = Math.min(minLength, maxLength);

		// Maximum amount of data to analyze (0 = analyze the full content)
		analyzeLength = DEFAULT_ANALYSIS_LENGTH;

		Properties p = new Properties();
		try {
			p.load(this.getClass().getResourceAsStream("_resources/langmappings.properties"));

			Enumeration<Object> alllanguages = p.keys();
			StringBuilder list = new StringBuilder("Language identifier plugin supports:");
			Map<NGramEntry, List<NGramEntry>> tmpIdx = new HashMap<NGramEntry, List<NGramEntry>>();
			while (alllanguages.hasMoreElements()) {
				String lang = (String) (alllanguages.nextElement());

				InputStream is = this.getClass().getResourceAsStream(
						"_resources/" + lang + "." + NGramProfile.FILE_EXTENSION);

				if (is != null) {
					NGramProfile profile = new NGramProfile(lang, minLength, maxLength);
					try {
						profile.load(is);
						languages.add(profile);
						supportedLanguages.add(lang);
						List<NGramEntry> ngrams = profile.getSorted();
						for (int i = 0; i < ngrams.size(); i++) {
							NGramEntry entry = ngrams.get(i);
							List<NGramEntry> registered = tmpIdx.get(entry);
							if (registered == null) {
								registered = new ArrayList<NGramEntry>();
								tmpIdx.put(entry, registered);
							}
							registered.add(entry);
							entry.setProfile(profile);
						}
						list.append(" " + lang + "(" + ngrams.size() + ")");
						is.close();
					} catch (IOException e1) {
						log.error("", e1);
					}
				}
			}

			// transform all ngram lists to arrays for performance
			Iterator<NGramEntry> keys = tmpIdx.keySet().iterator();
			while (keys.hasNext()) {
				NGramEntry entry = keys.next();
				List<NGramEntry> l = tmpIdx.get(entry);
				if (l != null) {
					NGramEntry[] array = l.toArray(new NGramEntry[l.size()]);
					ngramsIdx.put(entry.getSeq(), array);
				}
			}
			log.info(list.toString());

			// Create the suspect profile
			suspect = new NGramProfile("suspect", minLength, maxLength);
		} catch (Exception e) {
			log.error("", e);
		}
	}

	/**
	 * Identify the language of a content.
	 *
	 * @param content is the content to analyze.
	 * @return The 2 letter
	 *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
	 *         language code</a> (en, fi, sv, ...) of the language that best
	 *         matches the specified content.
	 */
	public String identify(String content) {
		return identify(new StringBuilder(content));
	}

	/**
	 * Identify the language of a content.
	 *
	 * @param content is the content to analyze.
	 * @return The 2 letter
	 *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
	 *         language code</a> (en, fi, sv, ...) of the language that best
	 *         matches the specified content.
	 */
	public String identify(StringBuilder content) {
		StringBuilder text = content;
		if ((analyzeLength > 0) && (content.length() > analyzeLength)) {
			// only analyze the first analyzeLength characters
			text = new StringBuilder().append(content);
			text.setLength(analyzeLength);
		}

		suspect.analyze(text);
		Iterator<NGramEntry> iter = suspect.getSorted().iterator();
		float topscore = Float.MIN_VALUE;
		String lang = "";
		HashMap<NGramProfile, Float> scores = new HashMap<NGramProfile, Float>();
		NGramEntry searched = null;

		// score every language profile that shares an ngram with the suspect
		// profile and remember the best scoring one
		while (iter.hasNext()) {
			searched = iter.next();
			NGramEntry[] ngrams = ngramsIdx.get(searched.getSeq());
			if (ngrams != null) {
				for (int j = 0; j < ngrams.length; j++) {
					NGramProfile profile = ngrams[j].getProfile();
					Float pScore = scores.get(profile);
					if (pScore == null) {
						pScore = new Float(0);
					}
					float plScore = pScore.floatValue();
					plScore += ngrams[j].getFrequency() + searched.getFrequency();
					scores.put(profile, new Float(plScore));
					if (plScore > topscore) {
						topscore = plScore;
						lang = profile.getName();
					}
				}
			}
		}
		return lang;
	}

	/**
	 * Identify the language from an input stream.
	 * This method uses the platform default encoding to read the input stream.
	 * To use a specific encoding, use the
	 * {@link #identify(InputStream, String)} method.
	 *
	 * @param is is the input stream to analyze.
	 * @return The 2 letter
	 *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
	 *         language code</a> (en, fi, sv, ...) of the language that best
	 *         matches the content of the specified input stream.
	 * @throws IOException if something goes wrong on the input stream.
	 */
	public String identify(InputStream is) throws IOException {
		return identify(is, null);
	}

	/**
	 * Identify the language from an input stream.
	 *
	 * @param is is the input stream to analyze.
	 * @param charset is the charset to use to read the input stream.
	 * @return The 2 letter
	 *         <a href="http://www.w3.org/WAI/ER/IG/ert/iso639.htm">ISO 639
	 *         language code</a> (en, fi, sv, ...) of the language that best
	 *         matches the content of the specified input stream.
	 * @throws IOException if something goes wrong on the input stream.
	 */
	public String identify(InputStream is, String charset) throws IOException {
		ByteArrayOutputStream out = new ByteArrayOutputStream();
		byte[] buffer = new byte[2048];
		int len = 0;

		// read at most analyzeLength bytes (0 means no limit)
		while (((len = is.read(buffer)) != -1)
				&& ((analyzeLength == 0) || (out.size() < analyzeLength))) {
			if (analyzeLength != 0) {
				len = Math.min(len, analyzeLength - out.size());
			}
			out.write(buffer, 0, len);
		}
		return identify((charset == null) ? out.toString() : out.toString(charset));
	}
}
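
// Usage sketch (illustrative, not part of the original class): assuming the
// n-gram profiles and langmappings.properties are available under _resources
// on the classpath, identification might look like the following. The sample
// text, stream and variable names below are hypothetical.
//
//   LanguageIdentifier identifier = new LanguageIdentifier();
//
//   // identify a plain string
//   String lang = identifier.identify("Ceci est un petit texte en français.");
//   // lang is expected to be a 2 letter ISO 639 code such as "fr"
//
//   // identify the content of a stream, read with an explicit charset
//   // String streamLang = identifier.identify(inputStream, "UTF-8");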