/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.language;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Writer;
import java.util.HashMap;
import java.util.Locale;
import org.apache.tika.io.IOUtils;
import org.junit.Before;
import org.junit.Test;
/**
 * JUnit based test of class {@link LanguageIdentifier}.
 *
 * <p>Each test feeds the contents of one or more {@code <language>.test}
 * class path resources (resolved relative to this class) through a
 * {@link ProfilingWriter} and checks what {@link LanguageIdentifier}
 * reports for the resulting profile.
 *
 * <p>NOTE(review): the class is marked {@code @Deprecated}, presumably because
 * the class under test is deprecated as well - confirm.
 *
 * @author Sami Siren
 * @author Jerome Charron - http://frutch.free.fr/
 */
@Deprecated
public class LanguageIdentifierTest {

    /** Language codes whose {@code <code>.test} resources are exercised by the loops below. */
    private static final String[] languages = new String[] {
        // TODO - currently Estonian and Greek fail these tests.
        // Enable when language detection works better.
        "da", "de", /* "et", "el", */ "en", "es", "fi", "fr", "it",
        "lt", "nl", "pt", "sv"
    };

    /**
     * Calls {@link LanguageIdentifier#initProfiles()} before every test, so
     * that a test which clears or replaces the profiles (see
     * {@link #testClearAddAndInitProfiles()}) starts from a freshly
     * initialized set.
     */
    @Before
    public void setUp() {
        LanguageIdentifier.initProfiles();
    }

    /**
     * Verifies that every language in {@link #languages} is identified from
     * its own test text, and that the identifier is reasonably certain about
     * it (except for Lithuanian).
     *
     * @throws IOException if a test resource cannot be read
     */
    @Test
    public void testLanguageDetection() throws IOException {
        for (String language : languages) {
            ProfilingWriter writer = new ProfilingWriter();
            writeTo(language, writer);
            LanguageIdentifier identifier = new LanguageIdentifier(writer.getProfile());
            assertEquals(language, identifier.getLanguage());
            // Lithuanian is detected but isn't reasonably certain:
            if (!language.equals("lt")) {
                assertTrue(identifier.toString(), identifier.isReasonablyCertain());
            }
        }
    }

    /**
     * Exercises {@code clearProfiles}, {@code addProfile} and
     * {@code initProfiles(Map)} and checks the identification result after
     * each step.
     *
     * @throws IOException if a test resource cannot be read
     */
    @Test
    public void testClearAddAndInitProfiles() throws IOException {
        // Prepare english and german language profiles
        ProfilingWriter enWriter = new ProfilingWriter();
        writeTo("en", enWriter);
        LanguageProfile enProfile = enWriter.getProfile();

        ProfilingWriter deWriter = new ProfilingWriter();
        writeTo("de", deWriter);
        LanguageProfile deProfile = deWriter.getProfile();

        // Out of the box profiles
        LanguageIdentifier identifier = new LanguageIdentifier(enProfile);
        assertEquals("en", identifier.getLanguage());
        assertTrue(identifier.isReasonablyCertain());

        // No profiles
        LanguageIdentifier.clearProfiles();
        identifier = new LanguageIdentifier(enProfile);
        assertFalse(identifier.isReasonablyCertain());

        // Only English profile
        LanguageIdentifier.addProfile("en", enProfile);
        identifier = new LanguageIdentifier(enProfile);
        assertEquals("en", identifier.getLanguage());
        assertTrue(identifier.isReasonablyCertain());

        // English and German profiles loaded explicitly from initProfiles method
        HashMap<String, LanguageProfile> profilesMap = new HashMap<>();
        profilesMap.put("en", enProfile);
        profilesMap.put("de", deProfile);
        LanguageIdentifier.initProfiles(profilesMap);

        identifier = new LanguageIdentifier(enProfile);
        assertEquals("en", identifier.getLanguage());
        assertTrue(identifier.isReasonablyCertain());

        identifier = new LanguageIdentifier(deProfile);
        assertEquals("de", identifier.getLanguage());
        assertTrue(identifier.isReasonablyCertain());
    }

    /**
     * Manual benchmark: alternates {@code LanguageProfile.useInterleaved}
     * between runs, prints the average time per detection and asserts that
     * consecutive runs produce the same sequence of detected languages.
     *
     * <p>Deliberately not annotated with {@code @Test}; enable this to compare
     * performance. The previous value of {@code LanguageProfile.useInterleaved}
     * is restored when the method exits so the benchmark does not leak global
     * state into other tests.
     *
     * @throws IOException if a test resource cannot be read
     */
    public void testPerformance() throws IOException {
        final int MRUNS = 8;
        final int IRUNS = 10;
        final boolean originalInterleaved = LanguageProfile.useInterleaved;
        int detected = 0; // To avoid code removal by JVM or compiler
        String lastResult = null;
        try {
            for (int m = 0; m < MRUNS; m++) {
                LanguageProfile.useInterleaved = (m & 1) == 1; // Alternate between standard and interleaved
                StringBuilder currentResult = new StringBuilder();
                final long start = System.nanoTime();
                for (int i = 0; i < IRUNS; i++) {
                    for (String language : languages) {
                        ProfilingWriter writer = new ProfilingWriter();
                        writeTo(language, writer);
                        LanguageIdentifier identifier = new LanguageIdentifier(writer.getProfile());
                        if (identifier.isReasonablyCertain()) {
                            currentResult.append(identifier.getLanguage());
                            detected++;
                        }
                    }
                }
                System.out.println(String.format(Locale.ROOT,
                        "Performed %d detections at %2d ms/test with interleaved=%b",
                        languages.length*IRUNS, (System.nanoTime()-start)/1000000/(languages.length*IRUNS),
                        LanguageProfile.useInterleaved));
                if (lastResult != null) { // Might as well test that they behave the same while we're at it
                    assertEquals("This result should be equal to the last", lastResult, currentResult.toString());
                }
                lastResult = currentResult.toString();
            }
        } finally {
            // Put the static flag back the way we found it.
            LanguageProfile.useInterleaved = originalInterleaved;
        }
        if (detected == -1) {
            System.out.println("Never encountered but keep it to guard against over-eager optimization");
        }
    }

    /**
     * Verifies that a profile built from the texts of two different languages
     * is never reported as reasonably certain. Pairs involving Lithuanian are
     * skipped.
     *
     * @throws IOException if a test resource cannot be read
     */
    @Test
    public void testMixedLanguages() throws IOException {
        for (String language : languages) {
            for (String other : languages) {
                // Skip identical pairs and every pair that involves Lithuanian.
                if (language.equals(other) || language.equals("lt") || other.equals("lt")) {
                    continue;
                }
                ProfilingWriter writer = new ProfilingWriter();
                writeTo(language, writer);
                writeTo(other, writer);
                LanguageIdentifier identifier = new LanguageIdentifier(writer.getProfile());
                assertFalse("mix of " + language + " and " + other + " incorrectly detected as " + identifier, identifier.isReasonablyCertain());
            }
        }
    }

    /**
     * TIKA-453: Fix up language identifier used for Estonian.
     *
     * <p>Only the reported language code is asserted here, not whether the
     * identifier is reasonably certain.
     *
     * @throws Exception if the test resource cannot be read
     */
    @Test
    public void testEstonia() throws Exception {
        final String estonian = "et";
        ProfilingWriter writer = new ProfilingWriter();
        writeTo(estonian, writer);
        LanguageIdentifier identifier =
            new LanguageIdentifier(writer.getProfile());
        assertEquals(estonian, identifier.getLanguage());
    }

    /**
     * Copies the UTF-8 decoded contents of the {@code <language>.test}
     * resource (looked up relative to this class) into the given writer.
     *
     * @param language language code used to build the resource name
     * @param writer destination for the decoded text
     * @throws IOException if the resource does not exist or cannot be read
     */
    private void writeTo(String language, Writer writer) throws IOException {
        final String resource = language + ".test";
        try (InputStream stream =
                LanguageIdentifierTest.class.getResourceAsStream(resource)) {
            // getResourceAsStream returns null for a missing resource; fail with
            // a message naming it instead of an anonymous NullPointerException.
            if (stream == null) {
                throw new IOException("Test resource not found: " + resource);
            }
            IOUtils.copy(new InputStreamReader(stream, UTF_8), writer);
        }
    }
}