/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.configuration;
import static org.hamcrest.Matchers.is;
import static org.junit.Assert.assertThat;
import java.io.File;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.ArrayUtils;
import org.apache.metamodel.DataContext;
import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.metamodel.data.DataSet;
import org.apache.metamodel.hbase.HBaseConfiguration;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Schema;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.util.ExclusionPredicate;
import org.apache.metamodel.util.Predicate;
import org.apache.metamodel.util.SimpleTableDef;
import org.datacleaner.api.RenderingFormat;
import org.datacleaner.connection.CassandraDatastore;
import org.datacleaner.connection.CouchDbDatastore;
import org.datacleaner.connection.CsvDatastore;
import org.datacleaner.connection.DataHubDatastore;
import org.datacleaner.connection.Datastore;
import org.datacleaner.connection.DatastoreCatalog;
import org.datacleaner.connection.DatastoreConnection;
import org.datacleaner.connection.ElasticSearchDatastore;
import org.datacleaner.connection.FixedWidthDatastore;
import org.datacleaner.connection.HBaseDatastore;
import org.datacleaner.connection.JdbcDatastore;
import org.datacleaner.connection.JsonDatastore;
import org.datacleaner.connection.MongoDbDatastore;
import org.datacleaner.connection.PojoDatastore;
import org.datacleaner.connection.UpdateableDatastoreConnection;
import org.datacleaner.connection.XmlDatastore;
import org.datacleaner.descriptors.ClasspathScanDescriptorProvider;
import org.datacleaner.descriptors.CompositeDescriptorProvider;
import org.datacleaner.descriptors.DescriptorProvider;
import org.datacleaner.descriptors.Descriptors;
import org.datacleaner.descriptors.RendererBeanDescriptor;
import org.datacleaner.job.concurrent.SingleThreadedTaskRunner;
import org.datacleaner.lifecycle.LifeCycleHelper;
import org.datacleaner.metamodel.datahub.DataHubSecurityMode;
import org.datacleaner.reference.DatastoreDictionary;
import org.datacleaner.reference.DatastoreSynonymCatalog;
import org.datacleaner.reference.Dictionary;
import org.datacleaner.reference.DictionaryConnection;
import org.datacleaner.reference.ReferenceDataCatalog;
import org.datacleaner.reference.StringPattern;
import org.datacleaner.reference.StringPatternConnection;
import org.datacleaner.reference.SynonymCatalog;
import org.datacleaner.reference.SynonymCatalogConnection;
import org.datacleaner.reference.TextFileDictionary;
import org.datacleaner.reference.TextFileSynonymCatalog;
import org.datacleaner.result.renderer.HtmlRenderingFormat;
import org.datacleaner.result.renderer.TextRenderingFormat;
import org.datacleaner.server.HadoopClusterInformation;
import org.datacleaner.storage.BerkeleyDbStorageProvider;
import org.datacleaner.storage.CombinedStorageProvider;
import org.datacleaner.storage.InMemoryRowAnnotationFactory2;
import org.datacleaner.storage.InMemoryStorageProvider;
import org.datacleaner.storage.RowAnnotationFactory;
import org.datacleaner.storage.StorageProvider;
import org.junit.Assert;
import junit.framework.TestCase;
public class JaxbConfigurationReaderTest extends TestCase {
private final JaxbConfigurationReader reader = new JaxbConfigurationReader();
private DatastoreCatalog _datastoreCatalog;
public void testReadCsvFilesWithSpecialCharacters() throws Exception {
final DataCleanerConfiguration configuration =
reader.create(new File("src/test/resources/example-configuration-csv-with-special-chars.xml"));
CsvDatastore csv = (CsvDatastore) configuration.getDatastoreCatalog().getDatastore("csv");
assertTrue("Unexpected separator: " + csv.getSeparatorChar(), '\t' == csv.getSeparatorChar());
assertTrue("Unexpected escape: " + csv.getEscapeChar(), CsvConfiguration.NOT_A_CHAR == csv.getEscapeChar());
assertTrue(csv.isMultilineValues());
csv = (CsvDatastore) configuration.getDatastoreCatalog().getDatastore("csv_quot");
assertEquals("\"", csv.getQuoteChar().toString());
assertFalse(csv.isMultilineValues());
}
public void testReadClasspathScannerWithExcludedRenderer() throws Exception {
final DataCleanerConfiguration configuration = reader.create(
new File("src/test/resources/example-configuration-classpath-scanner-with-exclusions.xml"));
final DescriptorProvider descriptorProvider = configuration.getEnvironment().getDescriptorProvider();
assertTrue(descriptorProvider instanceof CompositeDescriptorProvider);
final ClasspathScanDescriptorProvider scanner =
((CompositeDescriptorProvider) descriptorProvider).findClasspathScanProvider();
final Predicate<Class<? extends RenderingFormat<?>>> predicate = scanner.getRenderingFormatPredicate();
assertNotNull(predicate);
assertTrue(predicate instanceof ExclusionPredicate);
Collection<RendererBeanDescriptor<?>> renderers =
descriptorProvider.getRendererBeanDescriptorsForRenderingFormat(TextRenderingFormat.class);
assertTrue(renderers.isEmpty());
renderers = descriptorProvider.getRendererBeanDescriptorsForRenderingFormat(HtmlRenderingFormat.class);
assertFalse(renderers.isEmpty());
}
public void testReadComplexDataInPojoDatastore() throws Exception {
final DataCleanerConfiguration configuration = reader.create(
new File("src/test/resources/example-configuration-pojo-datastore-with-complex-data.xml"));
final Datastore datastore = configuration.getDatastoreCatalog().getDatastore("pojo");
assertNotNull(datastore);
final DatastoreConnection con = datastore.openConnection();
final DataContext dc = con.getDataContext();
final Table table = dc.getDefaultSchema().getTable(0);
final Column[] columns = table.getColumns();
assertEquals("[Column[name=Foo,columnNumber=0,type=VARCHAR,nullable=true,nativeType=null,columnSize=null], "
+ "Column[name=Bar,columnNumber=1,type=MAP,nullable=true,nativeType=null,columnSize=null], "
+ "Column[name=Baz,columnNumber=2,type=LIST,nullable=true,nativeType=null,columnSize=null], "
+ "Column[name=bytes,columnNumber=3,type=BINARY,nullable=true,nativeType=null,columnSize=null]]",
Arrays.toString(columns));
final DataSet ds = dc.query().from(table).select(columns).execute();
assertTrue(ds.next());
assertEquals("Hello", ds.getRow().getValue(0).toString());
assertEquals("{greeting=hello, person=world}", ds.getRow().getValue(1).toString());
assertEquals("[hello, world]", ds.getRow().getValue(2).toString());
assertEquals("{1,2,3,4,5}", ArrayUtils.toString(ds.getRow().getValue(3)));
assertTrue(ds.getRow().getValue(1) instanceof Map);
assertTrue(ds.getRow().getValue(2) instanceof List);
assertTrue(ds.getRow().getValue(3) instanceof byte[]);
assertTrue(ds.next());
assertEquals("There", ds.getRow().getValue(0).toString());
assertEquals("{greeting=hi, there you!, person={Firstname=Kasper, Lastname=Sørensen}}",
ds.getRow().getValue(1).toString());
assertEquals(null, ds.getRow().getValue(2));
assertEquals(null, ds.getRow().getValue(3));
assertTrue(ds.getRow().getValue(1) instanceof Map);
assertTrue(ds.next());
assertEquals("World", ds.getRow().getValue(0).toString());
assertEquals(null, ds.getRow().getValue(1));
assertEquals("[Sørensen, Kasper]", ds.getRow().getValue(2).toString());
assertEquals("{-1,-2,-3,-4,-5}", ArrayUtils.toString(ds.getRow().getValue(3)));
assertTrue(ds.getRow().getValue(2) instanceof List);
assertTrue(ds.getRow().getValue(3) instanceof byte[]);
}
public void testOverrideVariables() throws Exception {
System.setProperty("datastoreCatalog.myDatabase.username", "foobar");
System.setProperty("datastoreCatalog.persons_csv.filename", "foo/bar.csv");
try {
final DataCleanerConfiguration configuration =
reader.create(new File("src/test/resources/example-configuration-valid.xml"));
Datastore datastore = configuration.getDatastoreCatalog().getDatastore("my database");
assertTrue(datastore instanceof JdbcDatastore);
final String username = ((JdbcDatastore) datastore).getUsername();
assertEquals("foobar", username);
datastore = configuration.getDatastoreCatalog().getDatastore("persons_csv");
assertTrue(datastore instanceof CsvDatastore);
final String filename = ((CsvDatastore) datastore).getFilename();
assertEquals("foo/bar.csv", filename);
} finally {
System.getProperties().remove("datastoreCatalog.myDatabase.username");
System.getProperties().remove("datastoreCatalog.persons_csv.filename");
}
}
public void testValidConfiguration() throws Exception {
final DataCleanerConfiguration configuration =
reader.create(new File("src/test/resources/example-configuration-valid.xml"));
final DatastoreCatalog datastoreCatalog = getDataStoreCatalog(configuration);
assertEquals("[composite_datastore, my database, mydb_jndi, mydb_neo4j, persons_csv]",
Arrays.toString(datastoreCatalog.getDatastoreNames()));
assertTrue(configuration.getEnvironment().getTaskRunner() instanceof SingleThreadedTaskRunner);
}
public void testCombinedStorage() throws Exception {
final DataCleanerConfiguration configuration =
reader.create(new File("src/test/resources/example-configuration-combined-storage.xml"));
final StorageProvider storageProvider = configuration.getEnvironment().getStorageProvider();
assertEquals(CombinedStorageProvider.class, storageProvider.getClass());
final CombinedStorageProvider csp = (CombinedStorageProvider) storageProvider;
assertEquals(BerkeleyDbStorageProvider.class, csp.getCollectionsStorageProvider().getClass());
assertEquals(InMemoryStorageProvider.class, csp.getRowAnnotationsStorageProvider().getClass());
final RowAnnotationFactory rowAnnotationFactory =
csp.getRowAnnotationsStorageProvider().createRowAnnotationFactory();
assertEquals(InMemoryRowAnnotationFactory2.class, rowAnnotationFactory.getClass());
}
public void testAllDatastoreTypes() throws Exception {
final DatastoreCatalog datastoreCatalog = getDataStoreCatalog(getConfiguration());
final String[] datastoreNames = datastoreCatalog.getDatastoreNames();
assertEquals(
"[my cassandra db, my couch, my es index, my hbase, my mongo, my_access, my_composite, my_csv, "
+ "my_custom, my_datahub, my_dbase, my_dom_xml, my_excel_2003, my_fixed_width_1, "
+ "my_fixed_width_2, my_jdbc_connection, my_jdbc_datasource, my_json, my_odb, my_pojo, "
+ "my_sas, my_sax_xml, my_sfdc_ds, my_sugarcrm]", Arrays.toString(datastoreNames));
assertEquals("a mongo db based datastore", datastoreCatalog.getDatastore("my mongo").getDescription());
assertEquals("jdbc_con", datastoreCatalog.getDatastore("my_jdbc_connection").getDescription());
assertEquals("jdbc_ds", datastoreCatalog.getDatastore("my_jdbc_datasource").getDescription());
assertEquals("dbf", datastoreCatalog.getDatastore("my_dbase").getDescription());
final CsvDatastore myCsvDatastore = (CsvDatastore) datastoreCatalog.getDatastore("my_csv");
assertEquals("csv", myCsvDatastore.getDescription());
assertTrue(myCsvDatastore.isMultilineValues());
assertTrue(myCsvDatastore.isFailOnInconsistencies());
assertEquals('\\', myCsvDatastore.getEscapeChar().charValue());
final CassandraDatastore cassandraDatastore =
(CassandraDatastore) datastoreCatalog.getDatastore("my cassandra db");
assertEquals("localhost", cassandraDatastore.getHostname());
assertEquals(9042, cassandraDatastore.getPort());
assertEquals("my_keyspace", cassandraDatastore.getKeyspace());
assertEquals("foo", cassandraDatastore.getUsername());
assertEquals("bar", cassandraDatastore.getPassword());
assertEquals("[SimpleTableDef[name=table,columnNames=[bah, baz],columnTypes=[STRING, STRING]]]",
Arrays.toString(cassandraDatastore.getTableDefs()));
final ElasticSearchDatastore esDatastore =
(ElasticSearchDatastore) datastoreCatalog.getDatastore("my es index");
assertEquals("localhost", esDatastore.getHostname());
assertEquals(new Integer(9300), esDatastore.getPort());
assertEquals("my_es_cluster", esDatastore.getClusterName());
assertEquals("my_index", esDatastore.getIndexName());
assertNull(esDatastore.getTableDefs());
assertEquals("a SugarCRM instance", datastoreCatalog.getDatastore("my_sugarcrm").getDescription());
assertEquals("dom xml", datastoreCatalog.getDatastore("my_dom_xml").getDescription());
assertEquals("sax xml", datastoreCatalog.getDatastore("my_sax_xml").getDescription());
assertEquals("custom", datastoreCatalog.getDatastore("my_custom").getDescription());
assertEquals("odb", datastoreCatalog.getDatastore("my_odb").getDescription());
assertEquals("xls", datastoreCatalog.getDatastore("my_excel_2003").getDescription());
assertEquals("comp", datastoreCatalog.getDatastore("my_composite").getDescription());
assertEquals("salesforce.com is an online CRM system",
datastoreCatalog.getDatastore("my_sfdc_ds").getDescription());
assertEquals("mdb", datastoreCatalog.getDatastore("my_access").getDescription());
assertEquals("folder of sas7bdat files", datastoreCatalog.getDatastore("my_sas").getDescription());
assertEquals("A datastore based on plain values", datastoreCatalog.getDatastore("my_pojo").getDescription());
final PojoDatastore pojoDatastore = (PojoDatastore) datastoreCatalog.getDatastore("my_pojo");
{
try (UpdateableDatastoreConnection con = pojoDatastore.openConnection()) {
final DataContext dc = con.getDataContext();
final Schema schema = dc.getDefaultSchema();
assertEquals("my_schema", schema.getName());
assertEquals(2, schema.getTableCount());
assertEquals("[table1, table2]", Arrays.toString(schema.getTableNames()));
assertEquals(
"[Column[name=Foo,columnNumber=0,type=VARCHAR,nullable=true,nativeType=null,columnSize=null], "
+ "Column[name=Bar,columnNumber=1,type=INTEGER,nullable=true,nativeType=null,columnSize=null]]",
Arrays.toString(schema.getTable(0).getColumns()));
assertEquals(
"[Column[name=Baz,columnNumber=0,type=BOOLEAN,nullable=true,nativeType=null,columnSize=null]]",
Arrays.toString(schema.getTable(1).getColumns()));
try (DataSet ds = dc.query().from("table1").select("Foo", "Bar").execute()) {
assertTrue(ds.next());
assertEquals("Row[values=[Hello, 1]]", ds.getRow().toString());
assertEquals(String.class, ds.getRow().getValue(0).getClass());
assertEquals(Integer.class, ds.getRow().getValue(1).getClass());
assertTrue(ds.next());
assertEquals("Row[values=[There, null]]", ds.getRow().toString());
assertNull(ds.getRow().getValue(1));
}
try (DataSet ds = dc.query().from("table2").select("Baz").execute()) {
assertTrue(ds.next());
assertEquals("Row[values=[true]]", ds.getRow().toString());
assertEquals(Boolean.class, ds.getRow().getValue(0).getClass());
}
}
}
final CouchDbDatastore couchDbDatastore = (CouchDbDatastore) datastoreCatalog.getDatastore("my couch");
assertEquals("localhost", couchDbDatastore.getHostname());
assertEquals("user", couchDbDatastore.getUsername());
assertEquals("pass", couchDbDatastore.getPassword());
assertEquals(true, couchDbDatastore.isSslEnabled());
assertEquals(1, couchDbDatastore.getTableDefs().length);
assertEquals("SimpleTableDef[name=foobar,columnNames=[foo, bar, baz],columnTypes=[MAP, INTEGER, VARCHAR]]",
couchDbDatastore.getTableDefs()[0].toString());
final MongoDbDatastore mongoDbDatastore = (MongoDbDatastore) datastoreCatalog.getDatastore("my mongo");
assertEquals("analyzerbeans_test", mongoDbDatastore.getDatabaseName());
assertEquals("localhost", mongoDbDatastore.getHostname());
assertEquals(27017, mongoDbDatastore.getPort());
SimpleTableDef[] tableDefs = mongoDbDatastore.getTableDefs();
assertEquals("[SimpleTableDef[name=my_col_1,columnNames=[foo, bar, baz],columnTypes=[VARCHAR, INTEGER, DATE]]]",
Arrays.toString(tableDefs));
final XmlDatastore xmlDatastore = (XmlDatastore) datastoreCatalog.getDatastore("my_sax_xml");
assertEquals("../core/src/test/resources/example-xml-file.xml", xmlDatastore.getFilename());
assertEquals("[XmlSaxTableDef[rowXpath=/greetings/greeting,"
+ "valueXpaths=[/greetings/greeting/how, /greetings/greeting/what]]]",
Arrays.toString(xmlDatastore.getTableDefs()));
FixedWidthDatastore ds = (FixedWidthDatastore) datastoreCatalog.getDatastore("my_fixed_width_1");
assertEquals(19, ds.getFixedValueWidth());
assertEquals("[]", Arrays.toString(ds.getValueWidths()));
assertEquals(0, ds.getHeaderLineNumber());
ds = (FixedWidthDatastore) datastoreCatalog.getDatastore("my_fixed_width_2");
assertEquals(-1, ds.getFixedValueWidth());
assertEquals("[4, 17, 19]", Arrays.toString(ds.getValueWidths()));
assertEquals(1, ds.getHeaderLineNumber());
final HBaseDatastore hbaseDatastore = (HBaseDatastore) datastoreCatalog.getDatastore("my hbase");
assertEquals("HBaseDatastore[name=my hbase]", hbaseDatastore.toString());
assertEquals("localhost", hbaseDatastore.getZookeeperHostname());
assertEquals(HBaseConfiguration.DEFAULT_ZOOKEEPER_PORT, hbaseDatastore.getZookeeperPort());
tableDefs = hbaseDatastore.getTableDefs();
assertNotNull(tableDefs);
assertEquals(2, tableDefs.length);
assertEquals(
"SimpleTableDef[name=table1,columnNames=[fam1:foo, fam1:bar, fam2:baz],columnTypes=[STRING, STRING, INTEGER]]",
tableDefs[0].toString());
assertEquals("SimpleTableDef[name=table2,columnNames=[fam3:hello, fam3:world],columnTypes=[STRING, VARCHAR]]",
tableDefs[1].toString());
final JsonDatastore jsonDatastore = (JsonDatastore) datastoreCatalog.getDatastore("my_json");
assertEquals("JsonDatastore[name=my_json]", jsonDatastore.toString());
final DataHubDatastore dataHubDatastore = (DataHubDatastore) datastoreCatalog.getDatastore("my_datahub");
assertThat(dataHubDatastore.getName(), is("my_datahub"));
assertThat(dataHubDatastore.getHost(), is("hostname"));
assertThat(dataHubDatastore.getPort(), is(1234));
assertThat(dataHubDatastore.getUsername(), is("user"));
assertThat(dataHubDatastore.getPassword(), is("SECRET"));
assertThat(dataHubDatastore.isHttps(), is(false));
assertThat(dataHubDatastore.isAcceptUnverifiedSslPeers(), is(false));
assertThat(dataHubDatastore.getSecurityMode(), is(DataHubSecurityMode.DEFAULT));
for (final String name : datastoreNames) {
// test that all connections, except the JNDI-, MongoDB- and
// CouchDB-based on will work
if (!"my_jdbc_datasource".equals(name) && !"my mongo".equals(name) && !"my couch".equals(name)
&& !"my hbase".equals(name) && !"my_sfdc_ds".equals(name) && !"my_sugarcrm".equals(name)
&& !"my es index".equals(name) && !"my_datahub".equals(name)) {
final Datastore datastore = datastoreCatalog.getDatastore(name);
final DataContext dc;
try {
final DatastoreConnection connection = datastore.openConnection();
dc = connection.getDataContext();
assertNotNull(dc);
} catch (final RuntimeException e) {
throw new RuntimeException("Failed to read from datastore: " + name, e);
}
}
}
final Datastore compositeDatastore = datastoreCatalog.getDatastore("my_composite");
{
try (DatastoreConnection con = compositeDatastore.openConnection()) {
final DataContext dataContext = con.getDataContext();
final String[] schemaNames = dataContext.getSchemaNames();
assertEquals("[PUBLIC, Spreadsheet2003.xls, developers.mdb, resources]", Arrays.toString(schemaNames));
}
}
}
private DataCleanerConfiguration getConfiguration() {
return reader.create(new File("src/test/resources/example-configuration-all-datastore-types.xml"));
}
private DatastoreCatalog getDataStoreCatalog(final DataCleanerConfiguration configuration) {
_datastoreCatalog = configuration.getDatastoreCatalog();
return _datastoreCatalog;
}
public void testReferenceDataCatalog() throws Exception {
final DataCleanerConfiguration conf = getConfigurationFromXMLFile();
final ReferenceDataCatalog referenceDataCatalog = conf.getReferenceDataCatalog();
final String[] dictionaryNames = referenceDataCatalog.getDictionaryNames();
assertEquals("[custom_dict, datastore_dict, textfile_dict, valuelist_dict]", Arrays.toString(dictionaryNames));
final LifeCycleHelper lifeCycleHelper = new LifeCycleHelper(conf, null, true);
Dictionary d = referenceDataCatalog.getDictionary("datastore_dict");
assertEquals("dict_ds", d.getDescription());
lifeCycleHelper.assignProvidedProperties(Descriptors.ofComponent(d.getClass()), d);
lifeCycleHelper.initialize(Descriptors.ofComponent(d.getClass()), d);
DictionaryConnection dictionaryConnection = d.openConnection(conf);
assertTrue(dictionaryConnection.containsValue("Patterson"));
assertTrue(dictionaryConnection.containsValue("Murphy"));
assertFalse(dictionaryConnection.containsValue("Gates"));
dictionaryConnection.close();
assertFalse(((DatastoreDictionary) d).isLoadIntoMemory());
d = referenceDataCatalog.getDictionary("textfile_dict");
assertEquals("dict_txt", d.getDescription());
lifeCycleHelper.initialize(Descriptors.ofComponent(d.getClass()), d);
dictionaryConnection = d.openConnection(conf);
assertTrue(dictionaryConnection.containsValue("Patterson"));
assertFalse(dictionaryConnection.containsValue("Murphy"));
assertTrue(dictionaryConnection.containsValue("Gates"));
dictionaryConnection.close();
d = referenceDataCatalog.getDictionary("valuelist_dict");
assertEquals("dict_simple", d.getDescription());
lifeCycleHelper.initialize(Descriptors.ofComponent(d.getClass()), d);
dictionaryConnection = d.openConnection(conf);
assertFalse(dictionaryConnection.containsValue("Patterson"));
assertFalse(dictionaryConnection.containsValue("Murphy"));
assertTrue(dictionaryConnection.containsValue("greetings"));
dictionaryConnection.close();
d = referenceDataCatalog.getDictionary("custom_dict");
assertEquals("dict_custom", d.getDescription());
lifeCycleHelper.initialize(Descriptors.ofComponent(d.getClass()), d);
dictionaryConnection = d.openConnection(conf);
assertFalse(dictionaryConnection.containsValue("Patterson"));
assertFalse(dictionaryConnection.containsValue("Murphy"));
assertFalse(dictionaryConnection.containsValue("Gates"));
assertTrue(dictionaryConnection.containsValue("value0"));
assertTrue(dictionaryConnection.containsValue("value1"));
assertTrue(dictionaryConnection.containsValue("value2"));
assertTrue(dictionaryConnection.containsValue("value3"));
assertTrue(dictionaryConnection.containsValue("value4"));
assertFalse(dictionaryConnection.containsValue("value5"));
dictionaryConnection.close();
final String[] synonymCatalogNames = referenceDataCatalog.getSynonymCatalogNames();
assertEquals("[custom_syn, datastore_syn, textfile_syn]", Arrays.toString(synonymCatalogNames));
SynonymCatalog synonymCatalog = referenceDataCatalog.getSynonymCatalog("textfile_syn");
assertEquals("syn_txt", synonymCatalog.getDescription());
lifeCycleHelper.initialize(Descriptors.ofComponent(synonymCatalog.getClass()), synonymCatalog);
SynonymCatalogConnection synonymConnection = synonymCatalog.openConnection(conf);
assertEquals("DNK", synonymConnection.getMasterTerm("Denmark"));
assertEquals("DNK", synonymConnection.getMasterTerm("Danmark"));
assertEquals("DNK", synonymConnection.getMasterTerm("DK"));
assertEquals("ALB", synonymConnection.getMasterTerm("Albania"));
assertEquals(null, synonymConnection.getMasterTerm("Netherlands"));
synonymConnection.close();
synonymCatalog = referenceDataCatalog.getSynonymCatalog("datastore_syn");
assertEquals("syn_ds", synonymCatalog.getDescription());
assertTrue(((DatastoreSynonymCatalog) synonymCatalog).isLoadIntoMemory());
lifeCycleHelper.assignProvidedProperties(Descriptors.ofComponent(synonymCatalog.getClass()), synonymCatalog);
lifeCycleHelper.initialize(Descriptors.ofComponent(synonymCatalog.getClass()), synonymCatalog);
synonymConnection = synonymCatalog.openConnection(conf);
// lookup by id
assertEquals("La Rochelle Gifts", synonymConnection.getMasterTerm("119"));
// lookup by phone number (string)
assertEquals("Danish Wholesale Imports", synonymConnection.getMasterTerm("31 12 3555"));
assertEquals(null, synonymConnection.getMasterTerm("foobar"));
synonymConnection.close();
synonymCatalog = referenceDataCatalog.getSynonymCatalog("custom_syn");
assertEquals("syn_custom", synonymCatalog.getDescription());
lifeCycleHelper.initialize(Descriptors.ofComponent(synonymCatalog.getClass()), synonymCatalog);
synonymConnection = synonymCatalog.openConnection(conf);
assertEquals("DNK", synonymConnection.getMasterTerm("Denmark"));
assertEquals("DNK", synonymConnection.getMasterTerm("Danmark"));
assertEquals(null, synonymConnection.getMasterTerm("DK"));
assertEquals(null, synonymConnection.getMasterTerm("Albania"));
assertEquals("NLD", synonymConnection.getMasterTerm("Netherlands"));
synonymConnection.close();
final String[] stringPatternNames = referenceDataCatalog.getStringPatternNames();
assertEquals("[regex danish email, simple email]", Arrays.toString(stringPatternNames));
StringPattern pattern = referenceDataCatalog.getStringPattern("regex danish email");
assertEquals("pattern_reg", pattern.getDescription());
lifeCycleHelper.initialize(Descriptors.ofComponent(pattern.getClass()), pattern);
assertEquals(
"RegexStringPattern[name=regex danish email, expression=[a-z]+@[a-z]+\\.dk, matchEntireString=true]",
pattern.toString());
StringPatternConnection patternConnection = pattern.openConnection(conf);
assertTrue(patternConnection.matches("kasper@eobjects.dk"));
assertFalse(patternConnection.matches("kasper@eobjects.org"));
assertFalse(patternConnection.matches(" kasper@eobjects.dk"));
patternConnection.close();
pattern = referenceDataCatalog.getStringPattern("simple email");
assertEquals("pattern_simple", pattern.getDescription());
lifeCycleHelper.initialize(Descriptors.ofComponent(pattern.getClass()), pattern);
assertEquals("SimpleStringPattern[name=simple email, expression=aaaa@aaaaa.aa]", pattern.toString());
patternConnection = pattern.openConnection(conf);
assertTrue(patternConnection.matches("kasper@eobjects.dk"));
assertTrue(patternConnection.matches("kasper@eobjects.org"));
assertFalse(patternConnection.matches(" kasper@eobjects.dk"));
patternConnection.close();
}
public void testCustomDictionaryWithInjectedDatastore() {
final DataCleanerConfiguration configuration = getConfigurationFromXMLFile();
final ReferenceDataCatalog referenceDataCatalog = configuration.getReferenceDataCatalog();
final SampleCustomDictionary sampleCustomDictionary =
(SampleCustomDictionary) referenceDataCatalog.getDictionary("custom_dict");
Assert.assertEquals("my_jdbc_connection", sampleCustomDictionary.datastore.getName());
}
private DataCleanerConfiguration getConfigurationFromXMLFile() {
return reader.create(new File("src/test/resources/example-configuration-all-reference-data-types.xml"));
}
public void testRemoteServerConfiguration() throws Exception {
final DataCleanerConfiguration configuration =
reader.create(new File("src/test/resources/example-configuration-remote-servers.xml"));
final RemoteServerConfiguration remoteConf = configuration.getEnvironment().getRemoteServerConfiguration();
Assert.assertEquals(false, remoteConf.getServerList().isEmpty());
Assert.assertEquals(3, remoteConf.getServerList().size());
final RemoteServerData server0 = remoteConf.getServerList().get(0);
Assert.assertEquals("server1", server0.getServerName());
Assert.assertEquals("http://host1:8888", server0.getUrl());
Assert.assertEquals("totoro", server0.getUsername());
Assert.assertEquals("admin", server0.getPassword());
final RemoteServerData server1 = remoteConf.getServerList().get(1);
Assert.assertEquals("serverHost2", server1.getServerName());
Assert.assertEquals("http://host2:8888", server1.getUrl());
Assert.assertEquals("momo", server1.getUsername());
Assert.assertEquals("admin", server1.getPassword());
final RemoteServerData server2 = remoteConf.getServerList().get(2);
Assert.assertEquals("serverHost3", server2.getServerName());
Assert.assertEquals("http://host3:8888", server2.getUrl());
Assert.assertEquals("momo", server2.getUsername());
Assert.assertEquals("admin", server2.getPassword());
}
public void testRemoteServerConfigurationDefault() throws Exception {
final DataCleanerConfiguration configuration =
reader.create(new File("src/test/resources/example-configuration-remote-servers-empty.xml"));
final RemoteServerConfiguration remoteConf = configuration.getEnvironment().getRemoteServerConfiguration();
Assert.assertEquals(true, remoteConf.getServerList().isEmpty());
Assert.assertEquals(0, remoteConf.getServerList().size());
}
public void testServerConfigurations() {
final DataCleanerConfiguration configuration = getConfigurationFromXMLFile();
final ServerInformationCatalog serverInformationCatalog = configuration.getServerInformationCatalog();
Assert.assertTrue(serverInformationCatalog.containsServer("environment"));
Assert.assertTrue(serverInformationCatalog.containsServer("directories"));
Assert.assertTrue(serverInformationCatalog.containsServer("namenode"));
final HadoopClusterInformation environment =
(HadoopClusterInformation) serverInformationCatalog.getServer("environment");
Assert.assertEquals("environment", environment.getName());
final HadoopClusterInformation namenode =
(HadoopClusterInformation) serverInformationCatalog.getServer("namenode");
Assert.assertEquals("namenode", namenode.getName());
Assert.assertEquals("hdfs://localhost:8020/", namenode.getConfiguration().get("fs.defaultFS"));
}
public void testReadReferenceDataWithResources() throws Exception {
final DataCleanerConfiguration configuration =
reader.create(new File("src/test/resources/example-configuration-reference-data-resource-paths.xml"));
final TextFileDictionary dictionary =
(TextFileDictionary) configuration.getReferenceDataCatalog().getDictionary("dictionary");
assertEquals("C:/absolute/path/to/dictionary.txt", dictionary.getFilename());
final TextFileDictionary dictionary2 =
(TextFileDictionary) configuration.getReferenceDataCatalog().getDictionary("dictionary2");
assertEquals("C:/absolute/path/to/dictionary.txt", dictionary2.getFilename());
final TextFileSynonymCatalog synonyms =
(TextFileSynonymCatalog) configuration.getReferenceDataCatalog().getSynonymCatalog("synonyms");
assertEquals("relative/path/to/synonyms.txt", synonyms.getFilename());
final TextFileSynonymCatalog synonyms2 =
(TextFileSynonymCatalog) configuration.getReferenceDataCatalog().getSynonymCatalog("synonyms2");
assertEquals("relative/path/to/synonyms.txt", synonyms2.getFilename());
}
public void testReadFixedWidthDatastore() throws Exception {
final DataCleanerConfiguration configuration =
reader.create(new File("src/test/resources/example-job-fixed-width-datastore.xml"));
assertEquals("[employees-hadoop, my fixed width ds]",
Arrays.toString(configuration.getDatastoreCatalog().getDatastoreNames()));
}
}