/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.language; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URISyntaxException; import org.apache.tika.exception.TikaException; import org.junit.After; import org.junit.Test; @Deprecated public class LanguageProfilerBuilderTest { /* Test members */ private LanguageProfilerBuilder ngramProfile = null; private LanguageProfile langProfile = null; private final String profileName = "../tika-core/src/test/resources/org/apache/tika/language/langbuilder/" + LanguageProfilerBuilderTest.class.getName(); private final String corpusName = "langbuilder/welsh_corpus.txt"; private final String FILE_EXTENSION = "ngp"; private final String LANGUAGE = "welsh"; private final int maxlen = 1000; @Test public void testCreateProfile() throws TikaException, IOException, URISyntaxException { try (InputStream is = LanguageProfilerBuilderTest.class.getResourceAsStream(corpusName)) { ngramProfile = LanguageProfilerBuilder.create(profileName, is, UTF_8.name()); } File f = new File(profileName + "." + FILE_EXTENSION); FileOutputStream fos = new FileOutputStream(f); ngramProfile.save(fos); fos.close(); assertEquals(maxlen, ngramProfile.getSorted().size()); } @Test public void testNGramProfile() throws IOException, TikaException, URISyntaxException { createLanguageProfile(); LanguageIdentifier.addProfile(LANGUAGE, langProfile); LanguageIdentifier identifier = new LanguageIdentifier(langProfile); assertEquals(LANGUAGE, identifier.getLanguage()); assertTrue(identifier.isReasonablyCertain()); } private void createLanguageProfile() throws IOException, TikaException, URISyntaxException { // Sort of dependency injection if (ngramProfile == null) testCreateProfile(); langProfile = new LanguageProfile(); try (InputStream stream = new FileInputStream(new File(profileName + "." + FILE_EXTENSION))) { BufferedReader reader = new BufferedReader(new InputStreamReader( stream, UTF_8)); String line = reader.readLine(); while (line != null) { if (line.length() > 0 && !line.startsWith("#")) {// skips the // ngp // header/comment int space = line.indexOf(' '); langProfile.add(line.substring(0, space), Long.parseLong(line.substring(space + 1))); } line = reader.readLine(); } } } @After public void tearDown() throws Exception { File profile = new File(profileName + "." + FILE_EXTENSION); if (profile.exists()) profile.delete(); } }