/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.cli;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
public class SampleJobsIT {
@Rule
public TemporaryFolder tempFolder = new TemporaryFolder();
@Test
public void testCopyEmployeesToCustomerTable() throws Exception {
final Map<String, String[]> expectedResultSets = new HashMap<>();
expectedResultSets.put("RESULT: orderdb - CUSTOMERS (19 columns)", new String[] { "23 inserts executed" });
testJob("Copy employees to customer table", expectedResultSets);
}
@Test
public void testCustomerAgeAnalysis() throws Exception {
final Map<String, String[]> expectedResultSets = new HashMap<>();
// TODO: Because of https://github.com/datacleaner/DataCleaner/issues/1596 we don't know the exact
// pattern for the second match and therefore check it less strict then desirable. Revert it to a more
// strict check once that issue has been fixed.
expectedResultSets.put("RESULT: Birthdate Patterns (birthdate)",
new String[] { "Match count Sample", "####-##-## 4661", "## 427",
"a/a 27" });
expectedResultSets.put("RESULT: Number analyzer (Age in years)",
new String[] { "Age in years", "Row count 5115", "Null count 29",
"Highest value", "Lowest value", "Sum", "Mean", "Geometric mean",
// Really 21.04, but precision can make make the result change a little
"Standard deviation 21.0", "Variance", "Second moment", "Sum of squares", "Median",
"25th percentile", "75th percentile", "Skewness", "Kurtosis" });
// TODO: Because of https://github.com/datacleaner/DataCleaner/issues/1589 we don't know the exact
// number produced by the next resultset and therefore check it less strict then desirable. Fill in
// the exact numbers when fixing that issue.
expectedResultSets.put("RESULT: Month distribution (birthdate (as date))",
new String[] { "birthdate (as date)", "January", "February", "March", "April", "May", "June", "July",
"August", "September", "October", "November", "December" });
testJob("Customer age analysis", expectedResultSets);
}
@Test
public void testCustomerFilter() throws Exception {
final Map<String, String[]> expectedResultSets = new HashMap<>();
expectedResultSets
.put("RESULT: output-Customers-correct-data.csv (15 columns) (70.0 =< Age in years =< 80.0=LOWER)",
new String[] { "inserts executed" });
expectedResultSets.put("RESULT: output-Customers-age-null-or-invalid.csv (15 columns) "
+ "(FilterOutcome[category=HIGHER] OR FilterOutcome[category=NULL])",
new String[] { "inserts executed" });
testJob("Customer filtering", expectedResultSets);
}
@Test
public void testCustomerProfiling() throws Exception {
final Map<String, String[]> expectedResultSets = new HashMap<>();
expectedResultSets.put("RESULT: Unique key check (id)",
new String[] { "Unique key check result:", "- Row count: 5115", "- Null count: 0",
"- Unique count: 5100", "- Non-unique count: 15" });
expectedResultSets.put("RESULT: Currencies (income_currency)",
new String[] { "SingleValueDistributionResult:", "- Distinct count: 6", "- Null count: 0",
"- Total count: 5115", "- Unique count: 0", "- Value count (GBP): 1771",
"- Value count (EUR): 1511", "- Value count (USD): 1271", "- Value count (DKR): 513",
"- Value count (<blank>): 0", "- Value count (n/a): 12" });
expectedResultSets.put("RESULT: Gender matcher (gender)",
new String[] { "ValueMatchAnalyzerResult:", "- Null count: 0", "- Unexpected value count: 67",
"- Value count (M): 2483", "- Value count (F): 2122", "- Value count (U): 443" });
expectedResultSets.put("RESULT: Address completeness (address_line,post_code,city,country)",
new String[] { "CompletenessAnalyzerResult:", "- Row count: 5115", "- Valid row count: 5053",
"- Invalid row count: 62" });
expectedResultSets.put("RESULT: Character set distribution (given_name,family_name,company)",
new String[] { "given_name family_name company",
"Arabic 0 0 0",
"Armenian 0 0 0",
"Bengali 0 0 0",
"Cyrillic 0 0 0",
"Devanagari 0 0 0",
"Georgian 0 0 0",
"Greek 3 3 0",
"Gujarati 0 0 0",
"Gurmukhi 0 0 0",
"Han 4 4 0",
"Hangul 0 0 0",
"Hebrew 0 0 0",
"Hiragana 0 0 0",
"Kannada 0 0 0",
"Katakana 0 0 0",
"Latin, ASCII 5104 5106 5108",
"Latin, non-ASCII 59 198 48",
"Malayalam 0 0 0",
"Oriya 0 0 0",
"Syriac 0 0 0",
"Tamil 0 0 0",
"Telugu 0 0 0",
"Thaana 0 0 0",
"Thai 0 0 0" });
expectedResultSets.put("RESULT: Names completeness (given_name,family_name,company)",
new String[] { "CompletenessAnalyzerResult:", "- Row count: 5115", "- Valid row count: 5103",
"- Invalid row count: 12" });
testJob("Customer profiling", expectedResultSets);
}
@Test
public void testDenormalizeOrderTotalsAndPresentAsStackedAreaChart() throws Exception {
final Map<String, String[]> expectedResultSets = new HashMap<>();
expectedResultSets.put("RESULT: Stacked area plot (13 columns)",
new String[] { "JavaStackedAreaAnalyzerResult:", "(no metrics)" });
testJob("Denormalize order totals and present as stacked area chart", expectedResultSets);
}
@Test
public void testExportOfOrdersDataMart() throws Exception {
final Map<String, String[]> expectedResultSets = new HashMap<>();
expectedResultSets
.put("RESULT: orders_w_dimensions.csv (10 columns)", new String[] { "2996 inserts executed" });
expectedResultSets.put("RESULT: Lookup customers",
new String[] { "CategorizationResult:", "- Category count (Match):", "- Category count (Miss): 0",
"- Category count (Cached):" });
// Because the Country standardizer renders its results in a random order, we can't validate the
// actual number for each country.
expectedResultSets.put("RESULT: Country standardizer",
new String[] { "CountryStandardizationResult:", "- Category count (", "- Category count (",
"- Category count (", "- Category count (", "- Category count (", "- Category count (",
"- Category count (", "- Category count (", "- Category count (", "- Category count (",
"- Category count (", "- Category count (", "- Category count (", "- Category count (",
"- Category count (", "- Category count (", "- Category count (", "- Category count (",
"- Category count (", "- Category count (", "- Category count (" });
expectedResultSets.put("RESULT: Lookup products",
new String[] { "CategorizationResult:", "- Category count (Match):", "- Category count (Miss): 0",
"- Category count (Cached):" });
testJob("Export of Orders data mart", expectedResultSets);
}
@Test
public void testJobTitleAnalytics() throws Exception {
final Map<String, String[]> expectedResultSets = new HashMap<>();
expectedResultSets.put("RESULT: Unrecognized job titles (Trimmed job title)",
new String[] { "SingleValueDistributionResult:", "- Distinct count: 172", "- Null count: 0",
"- Total count: 1739", "- Unique count: 5", "- Value count (Data architect): 103",
"- Value count (Sr.): 61", "- Value count (Software engineer): 59",
"- Value count (Data analyst): 58", "- Value count (Business analyst): 57",
"- Value count (Project manager): 52", "- Value count (Senior consultant): 45",
"- Value count (Bi consultant): 35", "- Value count (It manager): 32",
"- Value count (Dr.): 30", "- Value count (Ing.): 30", "- Value count (Ingeniero): 30",
"- Value count (Bi architect): 29", "- Value count (N/a): 28", "- Value count (Associate): 24",
"- Value count (Managing director): 24", "- Value count (Solution architect): 24",
"- Value count (Software developer): 21", "- Value count (Data manager): 20",
"- Value count (Product manager): 20", "- Value count (Bi manager): 19",
"- Value count (Data warehouse architect): 19", "- Value count (Ms.): 19",
"- Value count (Sr. Consultant): 19", "- Value count (Enterprise architect): 18",
"- Value count (Information architect): 18", "- Value count (Senior data architect): 16",
"- Value count (Technical architect): 16", "- Value count (Etl developer): 15",
"- Value count (Sql developer): 15", "- Value count (Database manager): 14",
"- Value count (Systems engineer): 14", "- Value count (Bi specialist): 13",
"- Value count (Data scientist): 13", "- Value count (Development manager): 12",
"- Value count (Director of technology): 12", "- Value count (Ing sistemas): 12",
"- Value count (It consultant): 12", "- Value count (Mr.): 12",
"- Value count (Senior software engineer): 12", "- Value count (System analyst): 12",
"- Value count (It director): 11", "- Value count (Web developer): 11",
"- Value count (Application developer): 10", "- Value count (Bi developer): 10",
"- Value count (Dhr.): 10", "- Value count (It specialist): 10",
"- Value count (Sr business analyst): 10", "- Value count (Head of data management): 9",
"- Value count (Solutions architect): 9", "- Value count (Chief architect): 8",
"- Value count (Contractor): 8", "- Value count (Data consultant): 8",
"- Value count (Data specialist): 8", "- Value count (Dq consultant): 8",
"- Value count (Employee): 8", "- Value count (Etudiant): 8",
"- Value count (General manager): 8", "- Value count (Graphic designer): 8",
"- Value count (Marketing specialist): 8", "- Value count (Network admin): 8",
"- Value count (Senior developer): 8", "- Value count (Senior director): 8",
"- Value count (Software architect): 8", "- Value count (Vp technology): 8",
"- Value count (Digital marketing manager): 7",
"- Value count (Information systems engineer): 7", "- Value count (Ingeniera): 7",
"- Value count (It administrator): 7", "- Value count (It architect): 7",
"- Value count (Proprietor): 7", "- Value count (Staff engineer): 7",
"- Value count (Technical lead): 7", "- Value count (Avp): 6", "- Value count (Bsa): 6",
"- Value count (Crm manager): 6", "- Value count (D): 6",
"- Value count (Data quality manager): 6", "- Value count (Database administrator): 6",
"- Value count (Eng.): 6", "- Value count (Enginner): 6",
"- Value count (Jefe de proyectos): 6", "- Value count (Qq): 6",
"- Value count (Senior engineer): 6", "- Value count (Sir): 6",
"- Value count (Sqa manager): 6", "- Value count (Sr consultant): 6",
"- Value count (Sr. Data architect): 6", "- Value count (Sr. Software engineer): 6",
"- Value count (System engineer): 6", "- Value count (Bi director): 5",
"- Value count (Business data analyst): 5", "- Value count (Engneer): 5",
"- Value count (It support): 5", "- Value count (Kkk): 5",
"- Value count (Marketing director): 5", "- Value count (Monsieur): 5",
"- Value count (Principal consultant): 5", "- Value count (Programmer/analyst): 5",
"- Value count (Project lead): 5", "- Value count (R): 5", "- Value count (Sr manager): 5",
"- Value count (Technical consultant): 5", "- Value count (Admin): 4",
"- Value count (Advisor): 4", "- Value count (Analista de sistemas): 4",
"- Value count (Data and information quality specialist): 4",
"- Value count (Data governance manager): 4", "- Value count (Data warehouse designer): 4",
"- Value count (Database architect): 4", "- Value count (Ea): 4",
"- Value count (It-consultant): 4", "- Value count (Lead): 4", "- Value count (Lecturer): 4",
"- Value count (Leon): 4", "- Value count (Manage): 4", "- Value count (Managing partner): 4",
"- Value count (Mgr): 4", "- Value count (Network administrator): 4",
"- Value count (Qa analyst): 4", "- Value count (Rd): 4", "- Value count (Research officer): 4",
"- Value count (Sdf): 4", "- Value count (Senior analyst): 4",
"- Value count (Senior associate): 4", "- Value count (Sr. Data analyst): 4",
"- Value count (Sr. Manager): 4", "- Value count (Sys admin): 4",
"- Value count (System admin): 4", "- Value count (Systems architect): 4",
"- Value count (Administrador): 3", "- Value count (Assistant manager): 3",
"- Value count (Bi analyst): 3", "- Value count (Business development): 3",
"- Value count (C): 3", "- Value count (Computer engineer): 3",
"- Value count (Data cleanser): 3", "- Value count (Data quality analyst): 3",
"- Value count (Developper): 3", "- Value count (Director informatics): 3",
"- Value count (Director of business intelligence): 3", "- Value count (Estudante): 3",
"- Value count (Gerente): 3", "- Value count (Head of marketing): 3",
"- Value count (Manager bi): 3", "- Value count (Market analyst): 3",
"- Value count (Master): 3", "- Value count (Miner): 3", "- Value count (Oracle dba): 3",
"- Value count (Personal): 3", "- Value count (Program manager): 3",
"- Value count (Senior specialist): 3", "- Value count (Sm): 3",
"- Value count (Specialist): 3", "- Value count (The man): 3", "- Value count (Abcd): 2",
"- Value count (Hello): 2", "- Value count (It analyst): 2",
"- Value count (Managing consultant): 2", "- Value count (Mdm analyst): 2",
"- Value count (Mrs.): 2", "- Value count (Pepe): 2", "- Value count (Project leader): 2",
"- Value count (Sales engineer): 2", "- Value count (Senior business analyst): 2",
"- Value count (Solution director): 2", "- Value count (Tech arch): 2" });
expectedResultSets
.put("RESULT: Job title distribution (Recognized job title) (Is job title recognized?=NOT_NULL)",
new String[] { "SingleValueDistributionResult:", "- Distinct count: 14", "- Null count: 0",
"- Total count: 3376", "- Unique count: 0", "- Value count ((dirt)): 585",
"- Value count (Consultant): 578", "- Value count (Executive): 548",
"- Value count (Student/Intern): 467", "- Value count (Software developer): 439",
"- Value count ((honorific)): 247", "- Value count (Manager): 181",
"- Value count (Analyst): 147", "- Value count (Database administrator): 105",
"- Value count (Academic): 45", "- Value count (Independent): 15",
"- Value count (Data warehouse developer): 10", "- Value count (Sales): 5",
"- Value count (Marketeer): 4" });
testJob("Job title analytics", expectedResultSets);
}
@Test
public void testOrderDBCustomersAndEmployeesUnion() throws Exception {
final Map<String, String[]> expectedResultSets = new HashMap<>();
expectedResultSets.put("RESULT: Unique person identifier check (CUSTOMERNUMBER)",
new String[] { "Unique key check result:", "- Row count: 236", "- Null count: 0", "- Unique count: 230",
"- Non-unique count: 6" });
expectedResultSets
.put("RESULT: Completeness analyzer (CUSTOMERNUMBER,CONTACTFIRSTNAME,CONTACTLASTNAME,JOBTITLE)",
new String[] { "CompletenessAnalyzerResult:", "- Row count: 237", "- Valid row count: 236",
"- Invalid row count: 1" });
expectedResultSets
.put("RESULT: orderdb-people - incomplete-records (JOBTITLE,CUSTOMERNUMBER,CONTACTFIRSTNAME,CONTACTLASTNAME)",
new String[] { "1 inserts executed" });
testJob("OrderDB Customers and Employees union", expectedResultSets);
}
@Test
public void testUSCustomerSTATECheck() throws Exception {
final Map<String, String[]> expectedResultSets = new HashMap<>();
expectedResultSets.put("RESULT: Country Completeness (COUNTRY)",
new String[] { "CompletenessAnalyzerResult:", "- Row count: 214", "- Valid row count: 203",
"- Invalid row count: 11" });
expectedResultSets
.put("RESULT: US-customers-without-state.csv (CUSTOMERNUMBER,CUSTOMERNAME) (STATE is null?=NULL)",
new String[] { "11 inserts executed" });
expectedResultSets.put("RESULT: US state distribution (STATE) (STATE is null?=NOT_NULL)",
new String[] { "SingleValueDistributionResult:", "- Distinct count: 10", "- Null count: 0",
"- Total count: 41", "- Unique count: 4", "- Value count (CA): 13", "- Value count (MA): 8",
"- Value count (NY): 6", "- Value count (CT): 4", "- Value count (California): 3",
"- Value count (PA): 3" });
testJob("US Customer STATE check", expectedResultSets);
}
private void testJob(final String jobName, final Map<String, String[]> expectedResultSets) throws Exception {
final File jobTempRepoFolder = this.tempFolder.newFolder();
FileUtils.copyDirectory(new File("src/main/resources/datacleaner-home/"), jobTempRepoFolder);
JobTestHelper.testJob(jobTempRepoFolder, jobName, expectedResultSets);
}
}