/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.enhancer.engines.langdetect; import static org.junit.Assert.assertEquals; import static org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllEntityAnnotations; import static org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllTextAnnotations; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.HashMap; import org.apache.clerezza.rdf.core.LiteralFactory; import org.apache.clerezza.commons.rdf.RDFTerm; import org.apache.clerezza.commons.rdf.IRI; import org.apache.commons.io.IOUtils; import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory; import org.apache.stanbol.enhancer.servicesapi.ContentItem; import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory; import org.apache.stanbol.enhancer.servicesapi.EngineException; import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; import org.apache.stanbol.enhancer.servicesapi.impl.StringSource; import org.apache.stanbol.enhancer.servicesapi.rdf.Properties; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; import org.osgi.service.cm.ConfigurationException; import org.osgi.service.component.ComponentContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.cybozu.labs.langdetect.Detector; import com.cybozu.labs.langdetect.DetectorFactory; import com.cybozu.labs.langdetect.LangDetectException; /** * {@link LanguageDetectionEngineTest} is a test class for {@link TextCategorizer}. * * @author Walter Kasper, DFKI */ public class LanguageDetectionEngineTest { private static final Logger LOG = LoggerFactory.getLogger(LanguageDetectionEngineTest.class); private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance(); private static final String[] TEST_FILE_NAMES = {"en.txt","ja.txt","ko.txt","zh.txt"}; private static LanguageIdentifier langId; /** * This initializes the text categorizer. * @throws LangDetectException */ @BeforeClass public static void oneTimeSetUp() throws IOException, LangDetectException { langId = new LanguageIdentifier(); } /** * Tests the language identification. * * @throws IOException if there is an error when reading the text */ @Test public void testLangId() throws LangDetectException, IOException { LOG.info("Testing: {}", Arrays.asList(TEST_FILE_NAMES)); for (String file: TEST_FILE_NAMES) { String expectedLang = file.substring(0,2); InputStream in = LanguageDetectionEngineTest.class.getClassLoader().getResourceAsStream(file); assertNotNull("failed to load resource " + file, in); String text = IOUtils.toString(in, "UTF-8"); in.close(); String language = langId.getLanguage(text); if (!expectedLang.equals(language.substring(0,2))) { LOG.info("Expected: {}; Found {}",expectedLang,language); } assertEquals(expectedLang, language.substring(0,2)); } } /** * Test the engine and validates the created enhancements * @throws EngineException * @throws IOException * @throws ConfigurationException * @throws LangDetectException */ @Test public void testEngine() throws EngineException, ConfigurationException, LangDetectException, IOException { LOG.info("Testing engine: {}", TEST_FILE_NAMES[0]); InputStream in = LanguageDetectionEngineTest.class.getClassLoader().getResourceAsStream(TEST_FILE_NAMES[0]); assertNotNull("failed to load resource " + TEST_FILE_NAMES[0], in); String text = IOUtils.toString(in, "UTF-8"); in.close(); LanguageDetectionEnhancementEngine langIdEngine = new LanguageDetectionEnhancementEngine(); ComponentContext context = new MockComponentContext(); context.getProperties().put(EnhancementEngine.PROPERTY_NAME, "langdetect"); langIdEngine.activate(context); ContentItem ci = ciFactory.createContentItem(new StringSource(text)); langIdEngine.computeEnhancements(ci); HashMap<IRI,RDFTerm> expectedValues = new HashMap<IRI,RDFTerm>(); expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri()); expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral( langIdEngine.getClass().getName())); int textAnnotationCount = validateAllTextAnnotations(ci.getMetadata(), text, expectedValues); assertTrue("A TextAnnotation is expected", textAnnotationCount > 0); //even through this tests do not validate detection quality //we expect the "en" is detected as best guess for the parsed text assertEquals("The detected language for text '"+text+"' MUST BE 'en'", "en",EnhancementEngineHelper.getLanguage(ci)); int entityAnnoNum = validateAllEntityAnnotations(ci.getMetadata(), expectedValues); assertEquals("No EntityAnnotations are expected",0, entityAnnoNum); } @Test public void testNonTextContent() throws EngineException, ConfigurationException, LangDetectException, IOException { LanguageDetectionEnhancementEngine langIdEngine = new LanguageDetectionEnhancementEngine(); ComponentContext context = new MockComponentContext(); context.getProperties().put(EnhancementEngine.PROPERTY_NAME, "langdetect"); langIdEngine.activate(context); ContentItem ci = ciFactory.createContentItem(new StringSource("123")); langIdEngine.computeEnhancements(ci); } }