/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.analysis.lang;

// JDK imports
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.List;

// JUnit imports
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.textui.TestRunner;

// Lucene imports
import org.apache.lucene.analysis.Token;

// Nutch imports
import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
import org.apache.nutch.util.NutchConfiguration;

/**
 * JUnit based test of class {@link LanguageIdentifier}.
 *
 * @author Sami Siren
 * @author Jerome Charron - http://frutch.free.fr/
 */
public class TestLanguageIdentifier extends TestCase {

  public TestLanguageIdentifier(String testName) {
    super(testName);
  }

  public static Test suite() {
    return new TestSuite(TestLanguageIdentifier.class);
  }

  public static void main(String[] args) {
    TestRunner.run(suite());
  }

  String tokencontent1 = "testaddtoken";
  String tokencontent2 = "anotherteststring";

  // Expected unigram counts and characters for tokencontent1, sorted by descending count
  int[] counts1 = { 3, 2, 2, 1, 1, 1, 1, 1 };
  String[] chars1 = { "t", "d", "e", "a", "k", "n", "o", "s" };

  /**
   * Test the add(Token) method
   */
  public void testAddToken() {
    NGramProfile p = new NGramProfile("test", 1, 1);

    Token t = new Token(tokencontent1, 0, tokencontent1.length());
    p.add(t);
    p.normalize();
    testCounts(p.getSorted(), counts1);
    testContents(p.getSorted(), chars1);
  }

  /**
   * Test the analyze method
   */
  public void testAnalyze() {
    String tokencontent = "testmeagain";

    NGramProfile p = new NGramProfile("test", 1, 1);
    p.analyze(new StringBuilder(tokencontent));

    // test that profile size is ok, i.e. 8 different NGramEntries: "tesmagin"
    assertEquals(8, p.getSorted().size());
  }

  /**
   * Test the add method with a StringBuffer argument
   */
  public void testAddNGramsStringBuffer() {
    String tokencontent = "testmeagain";

    NGramProfile p = new NGramProfile("test", 1, 1);
    p.add(new StringBuffer(tokencontent));

    // test that profile size is ok, i.e. 8 different NGramEntries: "tesmagin"
    assertEquals(8, p.getSorted().size());
  }

  /**
   * Test the getSorted method
   */
  public void testGetSorted() {
    int[] count = { 4, 3, 1 };
    String[] ngram = { "a", "b", "c" };

    String teststring = "AAaaBbbC";

    NGramProfile p = new NGramProfile("test", 1, 1);
    p.analyze(new StringBuilder(teststring));

    // test size of profile
    assertEquals(3, p.getSorted().size());

    testCounts(p.getSorted(), count);
    testContents(p.getSorted(), ngram);
  }

  /**
   * Test that getSimilarity is symmetric
   */
  public void testGetSimilarity() {
    NGramProfile a = new NGramProfile("a", 1, 1);
    NGramProfile b = new NGramProfile("b", 1, 1);

    a.analyze(new StringBuilder(tokencontent1));
    b.analyze(new StringBuilder(tokencontent2));

    // because of rounding errors the two values might differ slightly
    assertEquals(a.getSimilarity(b), b.getSimilarity(a), 0.0000002);
  }

  /**
   * Test that a profile compared with itself scores 0, i.e. an exact match
   */
  public void testExactMatch() {
    NGramProfile a = new NGramProfile("a", 1, 1);

    a.analyze(new StringBuilder(tokencontent1));

    assertEquals(a.getSimilarity(a), 0, 0);
  }

  /**
   * Test that a profile saved to a stream can be loaded back unchanged
   */
  public void testIO() {
    // Create profile and set some contents
    NGramProfile a = new NGramProfile("a", 1, 1);
    a.analyze(new StringBuilder(this.tokencontent1));

    NGramProfile b = new NGramProfile("a_from_inputstream", 1, 1);

    // save profile
    ByteArrayOutputStream os = new ByteArrayOutputStream();
    try {
      a.save(os);
      os.close();
    } catch (Exception e) {
      fail();
    }

    // load profile
    InputStream is = new ByteArrayInputStream(os.toByteArray());
    try {
      b.load(is);
      is.close();
    } catch (Exception e) {
      fail();
    }

    // check it
    testCounts(b.getSorted(), counts1);
    testContents(b.getSorted(), chars1);
  }

  private void testContents(List<NGramEntry> entries, String contents[]) {
    int c = 0;
    for (NGramEntry nge : entries) {
      assertEquals(contents[c], nge.getSeq().toString());
      c++;
    }
  }

  private void testCounts(List<NGramEntry> entries, int counts[]) {
    int c = 0;
    for (NGramEntry nge : entries) {
      System.out.println(nge);
      assertEquals(counts[c], nge.getCount());
      c++;
    }
  }

  /**
   * Test language identification against the reference files listed in
   * test-referencial.txt (one "filename;language" entry per line).
   */
  public void testIdentify() {
    try {
      long total = 0;
      LanguageIdentifier idfr = new LanguageIdentifier(NutchConfiguration.create());
      BufferedReader in = new BufferedReader(new InputStreamReader(
          this.getClass().getResourceAsStream("test-referencial.txt")));
      String line = null;
      while ((line = in.readLine()) != null) {
        String[] tokens = line.split(";");
        if (!tokens[0].equals("")) {
          long start = System.currentTimeMillis();
          // Identify the whole file
          String lang = idfr.identify(
              this.getClass().getResourceAsStream(tokens[0]), "UTF-8");
          total += System.currentTimeMillis() - start;
          assertEquals(tokens[1], lang);
          // Then, each line of the file...
          BufferedReader testFile = new BufferedReader(new InputStreamReader(
              this.getClass().getResourceAsStream(tokens[0]), "UTF-8"));
          String testLine = null;
          while ((testLine = testFile.readLine()) != null) {
            testLine = testLine.trim();
            if (testLine.length() > 256) {
              lang = idfr.identify(testLine);
              assertEquals(tokens[1], lang);
            }
          }
          testFile.close();
        }
      }
      in.close();
      System.out.println("Total Time=" + total);
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.toString());
    }
  }
}