/**
* AnalyzerBeans
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.eobjects.analyzer.beans;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
import junit.framework.TestCase;
import org.eobjects.analyzer.data.InputColumn;
import org.eobjects.analyzer.data.InputRow;
import org.eobjects.analyzer.data.MockInputColumn;
import org.eobjects.analyzer.data.MockInputRow;
import org.eobjects.analyzer.result.AnnotatedRowsResult;
import org.eobjects.analyzer.result.CharacterSetDistributionResult;
import org.eobjects.analyzer.result.Crosstab;
import org.eobjects.analyzer.result.CrosstabNavigator;
import org.eobjects.analyzer.result.renderer.CrosstabTextRenderer;
import org.eobjects.analyzer.storage.InMemoryRowAnnotationFactory;
import org.apache.metamodel.util.EqualsBuilder;
import com.ibm.icu.text.UnicodeSet;
public class CharacterSetDistributionAnalyzerTest extends TestCase {
private static final String CHARSET_NAMES = "[Arabic, Armenian, Bengali, Cyrillic, Devanagari, Georgian, Greek, Gujarati, Gurmukhi, Han, Hangul, Hebrew, Hiragana, Kannada, Katakana, Latin, ASCII, Latin, non-ASCII, Malayalam, Oriya, Syriac, Tamil, Telugu, Thaana, Thai]";
public void testCreateFilters() throws Exception {
Map<String, UnicodeSet> unicodeSets = CharacterSetDistributionAnalyzer.createUnicodeSets();
Set<String> keys = unicodeSets.keySet();
assertEquals(CHARSET_NAMES, keys.toString());
UnicodeSet set = unicodeSets.get("Arabic");
assertFalse(set.contains('a'));
assertTrue(set.containsAll("البيانات"));
set = unicodeSets.get("Latin, ASCII");
assertTrue(set.contains('a'));
assertTrue(set.contains('z'));
assertFalse(set.contains('ä'));
assertFalse(set.contains('æ'));
set = unicodeSets.get("Latin, non-ASCII");
assertFalse(set.contains('a'));
assertFalse(set.contains('z'));
assertTrue(set.contains('ä'));
assertTrue(set.contains('æ'));
}
public void testSimpleScenario() throws Exception {
CharacterSetDistributionAnalyzer analyzer = new CharacterSetDistributionAnalyzer();
InputColumn<String> col1 = new MockInputColumn<String>("foo", String.class);
InputColumn<String> col2 = new MockInputColumn<String>("bar", String.class);
@SuppressWarnings("unchecked")
InputColumn<String>[] cols = new InputColumn[] { col1, col2 };
analyzer._columns = cols;
analyzer._annotationFactory = new InMemoryRowAnnotationFactory();
analyzer.init();
analyzer.run(new MockInputRow().put(col1, "foobar").put(col2, "foobar"), 10);
analyzer.run(new MockInputRow().put(col1, "DåtåClænør"), 1);
analyzer.run(new MockInputRow().put(col1, "Данныечистого"), 1);
analyzer.run(new MockInputRow().put(col1, "數據清潔"), 1);
analyzer.run(new MockInputRow().put(col1, "بيانات الأنظف"), 1);
analyzer.run(new MockInputRow().put(col1, "dữ liệu sạch hơn"), 1);
CharacterSetDistributionResult result = analyzer.getResult();
assertTrue(EqualsBuilder.equals(analyzer._columns, result.getColumns()));
assertEquals(CHARSET_NAMES, Arrays.toString(result.getUnicodeSetNames()));
Crosstab<?> crosstab = result.getCrosstab();
assertEquals("[Column, Measures]", Arrays.toString(crosstab.getDimensionNames()));
assertEquals(CHARSET_NAMES, crosstab.getDimension("Measures").getCategories().toString());
CrosstabNavigator<?> cyrillicNavigation = crosstab.navigate().where("Column", "foo").where("Measures", "Cyrillic");
assertEquals("1", cyrillicNavigation.get().toString());
AnnotatedRowsResult cyrillicAnnotatedRowsResult = (AnnotatedRowsResult) cyrillicNavigation.explore().getResult();
InputRow[] annotatedRows = cyrillicAnnotatedRowsResult.getRows();
assertEquals(1, annotatedRows.length);
assertEquals("Данныечистого", annotatedRows[0].getValue(col1));
assertEquals("12", crosstab.navigate().where("Column", "foo").where("Measures", "Latin, ASCII").get().toString());
assertEquals("2", crosstab.navigate().where("Column", "foo").where("Measures", "Latin, non-ASCII").get().toString());
String resultString = new CrosstabTextRenderer().render(result);
String[] resultLines = resultString.split("\n");
assertEquals(25, resultLines.length);
assertEquals(" foo bar ", resultLines[0]);
assertEquals("Arabic 1 0 ", resultLines[1]);
assertEquals("Armenian 0 0 ", resultLines[2]);
assertEquals("Bengali 0 0 ", resultLines[3]);
assertEquals("Cyrillic 1 0 ", resultLines[4]);
assertEquals("Devanagari 0 0 ", resultLines[5]);
assertEquals("Georgian 0 0 ", resultLines[6]);
assertEquals("Greek 0 0 ", resultLines[7]);
assertEquals("Gujarati 0 0 ", resultLines[8]);
assertEquals("Gurmukhi 0 0 ", resultLines[9]);
assertEquals("Han 1 0 ", resultLines[10]);
assertEquals("Hangul 0 0 ", resultLines[11]);
assertEquals("Hebrew 0 0 ", resultLines[12]);
assertEquals("Hiragana 0 0 ", resultLines[13]);
assertEquals("Kannada 0 0 ", resultLines[14]);
assertEquals("Katakana 0 0 ", resultLines[15]);
assertEquals("Latin, ASCII 12 10 ", resultLines[16]);
assertEquals("Latin, non-ASCII 2 0 ", resultLines[17]);
assertEquals("Malayalam 0 0 ", resultLines[18]);
assertEquals("Oriya 0 0 ", resultLines[19]);
assertEquals("Syriac 0 0 ", resultLines[20]);
assertEquals("Tamil 0 0 ", resultLines[21]);
assertEquals("Telugu 0 0 ", resultLines[22]);
assertEquals("Thaana 0 0 ", resultLines[23]);
assertEquals("Thai 0 0 ", resultLines[24]);
}
}