/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.tika.io.IOUtils;
import org.apache.tika.language.detect.LanguageConfidence;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.language.detect.LanguageWriter;
import org.junit.Test;

/**
 * Tests for {@link OptimaizeLangDetector}: detection of single-language text,
 * rejection of mixed-language text, short-text mode, and the short
 * "Universal Declaration of Human Rights" samples.
 */
public class OptimaizeLangDetectorTest extends LanguageDetectorTest {

    /*
     * The complete list of supported languages (as of 0.5) is below.
     * The ones we have tests for have '*' after the name.
     *
     * af    Afrikaans
     * an    Aragonese
     * ar    Arabic
     * ast   Asturian
     * be    Belarusian
     * br    Breton
     * ca    Catalan
     * bg    Bulgarian
     * bn    Bengali
     * cs    Czech
     * cy    Welsh
     * da    Danish *
     * de    German *
     * el    Greek *
     * en    English *
     * es    Spanish *
     * et    Estonian
     * eu    Basque
     * fa    Persian
     * fi    Finnish *
     * fr    French *
     * ga    Irish
     * gl    Galician
     * gu    Gujarati
     * he    Hebrew
     * hi    Hindi
     * hr    Croatian
     * ht    Haitian
     * hu    Hungarian
     * id    Indonesian
     * is    Icelandic
     * it    Italian *
     * ja    Japanese *
     * km    Khmer
     * kn    Kannada
     * ko    Korean
     * lt    Lithuanian *
     * lv    Latvian
     * mk    Macedonian
     * ml    Malayalam
     * mr    Marathi
     * ms    Malay
     * mt    Maltese
     * ne    Nepali
     * nl    Dutch *
     * no    Norwegian
     * oc    Occitan
     * pa    Punjabi
     * pl    Polish
     * pt    Portuguese *
     * ro    Romanian
     * ru    Russian
     * sk    Slovak
     * sl    Slovene
     * so    Somali
     * sq    Albanian
     * sr    Serbian
     * sv    Swedish *
     * sw    Swahili
     * ta    Tamil
     * te    Telugu
     * th    Thai *
     * tl    Tagalog
     * tr    Turkish
     * uk    Ukrainian
     * ur    Urdu
     * vi    Vietnamese
     * yi    Yiddish
     * zh-CN Simplified Chinese * (just generic Chinese)
     * zh-TW Traditional Chinese * (just generic Chinese)
     */

    /**
     * Test correct detection for the many (short) translations of the
     * "Universal Declaration of Human Rights (Article 1)", at
     * http://www.omniglot.com/udhr
     *
     * Also make sure we get uncertain results for some set of unsupported
     * languages.
     *
     * @throws Exception if the models or the test resources cannot be loaded,
     *         or if writing to the detector fails
     */
    @Test
    public void testUniversalDeclarationOfHumanRights() throws Exception {
        LanguageDetector detector = new OptimaizeLangDetector();
        detector.loadModels();

        // try-with-resources so the writer is closed even when an assertion fails.
        try (LanguageWriter writer = new LanguageWriter(detector)) {
            // Every sample in the "known" file must be detected as its own language.
            for (Map.Entry<String, String> sample : getTestLanguages("udhr-known.txt").entrySet()) {
                String language = sample.getKey();

                writer.reset();
                writer.append(sample.getValue());

                LanguageResult result = detector.detect();
                assertNotNull(
                        String.format(Locale.US, "Language '%s' wasn't detected", language),
                        result);
                assertEquals(
                        String.format(Locale.US, "Wrong language detected for '%s' (confidence %s)",
                                language, result.getConfidence()),
                        language, result.getLanguage());
            }

            // Samples in the "unknown" file may yield a result, but never a confident one.
            for (Map.Entry<String, String> sample : getTestLanguages("udhr-unknown.txt").entrySet()) {
                String language = sample.getKey();

                writer.reset();
                writer.append(sample.getValue());

                LanguageResult result = detector.detect();
                if (result != null) {
                    assertFalse(
                            String.format(Locale.US,
                                    "Unsupported language '%s' was confidently detected as '%s': %s",
                                    language, result.getLanguage(), result.getConfidence()),
                            result.isReasonablyCertain());
                }
            }
        }
    }

    /**
     * Verifies that the full test text of every test language is detected as
     * that language with reasonable certainty.
     *
     * @throws IOException if the models or the test text cannot be read
     */
    @Test
    public void testAllLanguages() throws IOException {
        LanguageDetector detector = new OptimaizeLangDetector();
        detector.loadModels();

        // The writer is now closed here too, consistent with the other tests.
        try (LanguageWriter writer = new LanguageWriter(detector)) {
            for (String language : getTestLanguages()) {
                writer.reset();
                writeTo(language, writer);

                LanguageResult result = detector.detect();
                assertNotNull(
                        String.format(Locale.US, "Language '%s' wasn't detected", language),
                        result);
                assertTrue(
                        String.format(Locale.US, "Language '%s' was detected as '%s'",
                                language, result.getLanguage()),
                        result.isLanguage(language));
                assertTrue(
                        String.format(Locale.US, "Language '%s' isn't reasonably certain: %s",
                                language, result.getConfidence()),
                        result.isReasonablyCertain());
            }
        }
    }

    /**
     * Verifies that, with mixed-language mode enabled, text made of two
     * different test languages never produces a reasonably certain top result.
     *
     * @throws IOException if the models or the test text cannot be read
     */
    @Test
    public void testMixedLanguages() throws IOException {
        LanguageDetector detector = new OptimaizeLangDetector().setMixedLanguages(true);
        detector.loadModels();

        String[] languages = getTestLanguages();

        try (LanguageWriter writer = new LanguageWriter(detector)) {
            // Try every unordered pair of distinct languages exactly once.
            for (int i = 0; i < languages.length; i++) {
                String language = languages[i];
                for (int j = i + 1; j < languages.length; j++) {
                    String other = languages[j];

                    writer.reset();
                    writeTo(language, writer);
                    writeTo(other, writer);

                    List<LanguageResult> results = detector.detectAll();
                    if (!results.isEmpty()) {
                        LanguageResult result = results.get(0);
                        assertFalse("mix of " + language + " and " + other
                                + " incorrectly detected as " + result,
                                result.isReasonablyCertain());
                    }
                }
            }
        }
    }

    /**
     * Verifies short-text mode: empty or blank input yields
     * {@link LanguageConfidence#NONE}, while 300 characters of each test
     * language (except Japanese) are detected with reasonable certainty.
     *
     * @throws IOException if the models or the test text cannot be read
     */
    @Test
    public void testShortText() throws IOException {
        LanguageDetector detector = new OptimaizeLangDetector()
                .setShortText(true)
                .loadModels();

        try (LanguageWriter writer = new LanguageWriter(detector)) {
            // First verify that we get no result with empty or very short text.
            writer.append("");
            assertEquals(LanguageConfidence.NONE, detector.detect().getConfidence());

            writer.reset();
            writer.append("  ");
            assertEquals(LanguageConfidence.NONE, detector.detect().getConfidence());

            for (String language : getTestLanguages()) {
                // Short pieces of Japanese are detected as Chinese
                if (language.equals("ja")) {
                    continue;
                }

                // We need at least 300 characters to detect Chinese reliably.
                writer.reset();
                writeTo(language, writer, 300);

                LanguageResult result = detector.detect();
                assertNotNull(
                        String.format(Locale.US, "Language '%s' wasn't detected", language),
                        result);
                assertTrue(
                        String.format(Locale.US, "Language '%s' was detected as '%s'",
                                language, result.getLanguage()),
                        result.isLanguage(language));
                assertTrue(
                        String.format(Locale.US, "Language '%s' isn't reasonably certain: %s",
                                language, result.getConfidence()),
                        result.isReasonablyCertain());
            }
        }
    }

    /**
     * Loads language samples from a classpath resource located relative to
     * this class.
     *
     * Each non-blank line that does not start with '#' must have the form
     * {@code <language code><TAB><sample text>}. If a language code occurs more
     * than once, the last line wins.
     *
     * @param resourceName name of the resource to read
     * @return map from language code to sample text
     * @throws IOException if the resource is missing or cannot be read
     * @throws IllegalArgumentException if a data line has no tab separator
     */
    private Map<String, String> getTestLanguages(String resourceName) throws IOException {
        Map<String, String> result = new HashMap<>();

        // try-with-resources closes the stream; a null resource is skipped on close.
        try (InputStream stream =
                OptimaizeLangDetectorTest.class.getResourceAsStream(resourceName)) {
            if (stream == null) {
                // Fail with the resource name instead of an NPE inside the reader.
                throw new IOException("Test resource not found: " + resourceName);
            }

            // NOTE(review): decoding relies on the default charset of
            // IOUtils.readLines(InputStream) - confirm it is UTF-8, since the
            // samples contain non-ASCII text.
            List<String> lines = IOUtils.readLines(stream);
            for (String rawLine : lines) {
                String line = rawLine.trim();
                if (line.isEmpty() || line.startsWith("#")) {
                    continue;
                }

                // Limit of 2 keeps any further tabs inside the sample text.
                String[] pieces = line.split("\t", 2);
                if (pieces.length != 2) {
                    throw new IllegalArgumentException("Invalid language data line: " + line);
                }

                result.put(pieces[0], pieces[1]);
            }
        }

        return result;
    }
}