// ============================================================================
//
// Copyright (C) 2006-2015 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.dataquality.datamasking.semantic;
import static org.junit.Assert.assertEquals;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Random;
import org.junit.Test;
import org.talend.dataquality.duplicating.AllDataqualitySamplingTests;
public class ValueDataMaskerTest {
private static final Map<String[], String> EXPECTED_MASKED_VALUES = new LinkedHashMap<String[], String>() {
private static final long serialVersionUID = 1L;
{
// 0. UNKNOWN
put(new String[] { " ", "UNKNOWN", "string" }, " ");
put(new String[] { "91000", "UNKNOWN", "integer" }, "86622");
put(new String[] { "92000", "UNKNOWN", "decimal" }, "87574");
put(new String[] { "93000", "UNKNOWN", "numeric" }, "88526");
put(new String[] { "2023-06-07", "UNKNOWN", "date" }, "2023-07-01");
put(new String[] { "sdkjs@talend.com", "UNKNOWN", "string" }, "vkfzz@psbbqg.aqa");
// 1. FIRST_NAME
put(new String[] { "", MaskableCategoryEnum.FIRST_NAME.name(), "string" }, "");
put(new String[] { "John", MaskableCategoryEnum.FIRST_NAME.name(), "string" }, "Josiah");
// 2. LAST_NAME
put(new String[] { "Dupont", MaskableCategoryEnum.LAST_NAME.name(), "string" }, "Robbins");
// 3. EMAIL
put(new String[] { "sdkjs@talend.com", MaskableCategoryEnum.EMAIL.name(), "String" }, "XXXXX@talend.com");
put(new String[] { "\t", MaskableCategoryEnum.FIRST_NAME.name(), "string" }, "\t");
// 4. PHONE
put(new String[] { "3333456789", MaskableCategoryEnum.US_PHONE.name(), "String" }, "3333699941");
// if we put two 1 at the fifth and sixth position, it's not a US valid number, so we replace all the digit
put(new String[] { "3333116789", MaskableCategoryEnum.US_PHONE.name(), "String" }, "2873888808");
put(new String[] { "321938", MaskableCategoryEnum.FR_PHONE.name(), "String" }, "996722");
put(new String[] { "++044dso44aa", MaskableCategoryEnum.DE_PHONE.name(), "String" }, "++287dso38aa");
put(new String[] { "666666666", MaskableCategoryEnum.UK_PHONE.name(), "String" }, "663330954");
put(new String[] { "777777777abc", MaskableCategoryEnum.UK_PHONE.name(), "String" }, "778886113abc");
put(new String[] { "(301) 231-9473 x 2364", MaskableCategoryEnum.US_PHONE.name(), "String" },
"(301) 231-9416 x 7116");
put(new String[] { "(563) 557-7600 Ext. 2890", MaskableCategoryEnum.US_PHONE.name(), "String" },
"(563) 557-7642 Ext. 4410");
// 5. JOB_TITLE
put(new String[] { "CEO", MaskableCategoryEnum.JOB_TITLE.name(), "String" }, "Cafeteria Cook");
// 6. ADDRESS_LINE
put(new String[] { "9 Rue Pagès", MaskableCategoryEnum.ADDRESS_LINE.name(), "String" }, "6 Rue XXXXX");
// 7 POSTAL_CODE
put(new String[] { "37218-1324", MaskableCategoryEnum.US_POSTAL_CODE.name(), "String" }, "32515-1655");
put(new String[] { "92150", MaskableCategoryEnum.FR_POSTAL_CODE.name(), "String" }, "32515");
put(new String[] { "63274", MaskableCategoryEnum.DE_POSTAL_CODE.name(), "String" }, "32515");
put(new String[] { "AT1 3BW", MaskableCategoryEnum.UK_POSTAL_CODE.name(), "String" }, "VK5 1ZP");
// 8 ORGANIZATION
// 9 COMPANY
// 10 CREDIT_CARD
put(new String[] { "5300 1232 8732 8318", MaskableCategoryEnum.US_CREDIT_CARD.name(), "String" },
"5332 5151 6550 0021");
put(new String[] { "5300123287328318", MaskableCategoryEnum.MASTERCARD.name(), "String" }, "5332515165500021");
put(new String[] { "4300 1232 8732 8318", MaskableCategoryEnum.VISACARD.name(), "String" }, "4325 1516 5500 0249");
// 11 SSN
put(new String[] { "728931789", MaskableCategoryEnum.US_SSN.name(), "String" }, "528-73-8888");
put(new String[] { "17612 38293 28232", MaskableCategoryEnum.FR_SSN.name(), "String" }, "2210622388880 15");
put(new String[] { "634217823", MaskableCategoryEnum.UK_SSN.name(), "String" }, "RB 87 38 88 D");
// Company
put(new String[] { "Talend", MaskableCategoryEnum.COMPANY.name(), "String" }, "Gilead Sciences");
// FR Commune
put(new String[] { "Amancey", MaskableCategoryEnum.FR_COMMUNE.name(), "String" }, "Dieppe");
// Organization
put(new String[] { "Kiva", MaskableCategoryEnum.ORGANIZATION.name(), "String" }, "Environmental Defense");
// EMPTY
put(new String[] { " ", "UNKNOWN", "integer" }, " ");
put(new String[] { " ", "UNKNOWN", "numeric" }, " ");
put(new String[] { " ", "UNKNOWN", "decimal" }, " ");
put(new String[] { " ", "UNKNOWN", "date" }, " ");
// NUMERIC
put(new String[] { "111", "UNKNOWN", "integer" }, "106");
put(new String[] { "-222.2", "UNKNOWN", "integer" }, "-211.5");
put(new String[] { "333", "UNKNOWN", "numeric" }, "317");
put(new String[] { "444,44", "UNKNOWN", "numeric" }, "423.06");
put(new String[] { "555", "UNKNOWN", "float" }, "528");
put(new String[] { "666.666", "UNKNOWN", "float" }, "634.595");
put(new String[] { "Abc123", "UNKNOWN", "float" }, "Zzp655"); // not numeric, mask by char replacement
// BIG NUMERIC
put(new String[] { "7777777777777777777777777777777777777", "UNKNOWN", "double" },
"7403611837072083888888888888888888888");
put(new String[] { "7777777777777777777777777777777777777.7777", "UNKNOWN", "double" },
"7403611837072083888888888888888888888.8888");
// ENGINEERING FORMAT
put(new String[] { "8e28", "UNKNOWN", "double" }, "7.615143603845572E28");
put(new String[] { "-9.999E29", "UNKNOWN", "double" }, "-9.517977611856484E29");
}
};
/**
* Test method for {@link org.talend.dataquality.datamasking.DataMasker#process(java.lang.Object, boolean)}.
*
* @throws IllegalAccessException
* @throws InstantiationException
*/
@Test
public void testProcess() throws InstantiationException, IllegalAccessException {
for (String[] input : EXPECTED_MASKED_VALUES.keySet()) {
String inputValue = input[0];
String semanticCategory = input[1];
String dataType = input[2];
System.out.print("[" + semanticCategory + "]\n\t" + inputValue + " => ");
final ValueDataMasker masker = new ValueDataMasker(semanticCategory, dataType);
masker.getFunction().setRandom(new Random(AllDataqualitySamplingTests.RANDOM_SEED));
String maskedValue = masker.maskValue(inputValue);
System.out.println(maskedValue);
assertEquals("Test faild on [" + inputValue + "]", EXPECTED_MASKED_VALUES.get(input), maskedValue);
}
// Assert.assertNotEquals(city, masker.process(city));
// masker should generate a city name
// Assert the masked value is in a list of city names
// categories to mask
// First names, last names, email, IP address (v4, v6), localization, GPS coordinates, phone
// Job title , street, address, zipcode, organization, company, full name, credit card number, account number,
//
// for these categories, here are the default functions to use:
// first name -> another first name (from a fixed list loaded from a data file in a resource folder)
// last name -> another last name (from a fixed list)
// email -> mask local part (MaskEmail function)
// phone -> keep 3 first digits and replace last digits
// Job title -> another job title (from a fixed list)
// street -> use MaskAddress
// zipCode -> replace All digits
// organization -> another organization (from a fixed list)
// company -> another company (from a fixed list)
// credit card -> generate a new one
// account number -> generate a new one
//
// Assertions: masked data must never be identical to original data (don't use random seed for the random
// generator to check that)
//
// data types to mask
// date, string, numeric
// create a ValueDataMasker for data that have no semantic category
// use ValueDataMasker masker = SemanticCategoryMaskerFactory.createMasker(dataType);
// here are the default functions to use for the different types:
// date -> DateVariance with parameter 61 (meaning two months)
// string -> use ReplaceAll
// numeric -> use NumericVariance
}
}