// ============================================================================ // // Copyright (C) 2006-2016 Talend Inc. - www.talend.com // // This source code is available under agreement available at // %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt // // You should have received a copy of the agreement // along with this program; if not, write to Talend SA // 9 rue Pages 92150 Suresnes, France // // ============================================================================ package org.talend.dataquality.duplicating; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Random; import org.apache.commons.codec.language.RefinedSoundex; public class FieldModifier { public enum Function { REPLACE_LETTER, ADD_LETTER, REPLACE_DIGIT, ADD_DIGIT, REMOVE_LETTER, REMOVE_DIGIT, EXCHANGE_CHAR, SOUNDEX_REPLACE, // find this function at SynonymReplaceAction class of org.talend.dataquality.standardization SYNONYM_REPLACE, SET_TO_BLANK, SET_TO_NULL, MODIFY_DATE_VALUE, SWITCH_DAY_MONTH_VALUE, REPLACE_BY_RANDOM_DATE } private static final String LETTER = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; //$NON-NLS-1$ private static final String DIGIT = "0123456789"; //$NON-NLS-1$ public static final String US_ENGLISH_MAPPING_STRING = RefinedSoundex.US_ENGLISH_MAPPING_STRING; private static final String EMPTY_STRING = ""; //$NON-NLS-1$ private final Random random = new Random(); private static char[] soundexMap = US_ENGLISH_MAPPING_STRING.toCharArray(); private Map<Character, List<Character>> inverseSoundexMap; private DateChanger dateChanger = new DateChanger(); public void setSeed(long seed) { random.setSeed(seed); dateChanger.setSeed(seed); } private Map<Character, List<Character>> getInverseSoundexMap() { if (inverseSoundexMap == null) { inverseSoundexMap = new HashMap<Character, List<Character>>(); for (int i = 0; i < soundexMap.length; i++) { List<Character> charSet = inverseSoundexMap.get(soundexMap[i]); if (charSet == null) { charSet = new ArrayList<Character>(); inverseSoundexMap.put(soundexMap[i], charSet); } charSet.add((char) ('A' + i)); } } return inverseSoundexMap; } public Date generateDuplicate(Date date, Function function, int modifCount, String extraParameter) { if (date == null || function == null) { return date; } Date newDate = new Date(date.getTime()); switch (function) { case SET_TO_NULL: return null; case MODIFY_DATE_VALUE: for (int i = 0; i < modifCount; i++) { dateChanger.modifyDateValue(newDate); } return newDate; case SWITCH_DAY_MONTH_VALUE: for (int i = 0; i < modifCount; i++) { dateChanger.switchDayMonthValue(newDate); } return newDate; case REPLACE_BY_RANDOM_DATE: for (int i = 0; i < modifCount; i++) { dateChanger.replaceWithRandomDate(newDate); } return newDate; default: break; } return date; } public String generateDuplicateString(String str, Function function, int modifCount, String extraParameter) { StringBuilder sb = new StringBuilder(str); switch (function) { case REPLACE_LETTER: if (sb.length() > 0) { for (int i = 0; i < modifCount; i++) { int pos = random.nextInt(sb.length()); int idx = random.nextInt(LETTER.length()); sb.setCharAt(pos, LETTER.charAt(idx)); } } break; case ADD_LETTER: for (int i = 0; i < modifCount; i++) { int pos = sb.length() == 0 ? 0 : random.nextInt(sb.length()); int idx = random.nextInt(LETTER.length()); sb.insert(pos, LETTER.charAt(idx)); } break; case REPLACE_DIGIT: if (sb.length() > 0) { for (int i = 0; i < modifCount; i++) { int pos = random.nextInt(sb.length()); int idx = random.nextInt(DIGIT.length()); sb.setCharAt(pos, DIGIT.charAt(idx)); } } break; case ADD_DIGIT: for (int i = 0; i < modifCount; i++) { int pos = sb.length() == 0 ? 0 : random.nextInt(sb.length()); int idx = random.nextInt(DIGIT.length()); if (pos == 0) { idx = random.nextInt(DIGIT.length() - 1) + 1; } sb.insert(pos, DIGIT.charAt(idx)); } break; case REMOVE_LETTER: for (int i = 0; i < modifCount; i++) { if (sb.length() > 0) { int pos = random.nextInt(sb.length()); sb.deleteCharAt(pos); } } break; case REMOVE_DIGIT: for (int i = 0; i < modifCount; i++) { if (sb.length() > 1) { int pos = random.nextInt(sb.length()); sb.deleteCharAt(pos); } } break; case EXCHANGE_CHAR: if (sb.length() > 1) { for (int i = 0; i < modifCount; i++) { int pos1 = random.nextInt(sb.length()); int pos2 = random.nextInt(sb.length()); if (pos1 != pos2) { char ch1 = sb.charAt(pos1); char ch2 = sb.charAt(pos2); sb.setCharAt(pos1, ch2); sb.setCharAt(pos2, ch1); } } } break; case SOUNDEX_REPLACE: if (sb.length() > 0) { List<Character> charSet = new ArrayList<Character>(); for (int i = 0; i < modifCount; i++) { int pos = random.nextInt(sb.length()); char charToReplace = sb.charAt(pos); int idx = Character.toUpperCase(charToReplace) - 'A'; if (idx >= 0 && idx < 26) { List<Character> soundexSet = getInverseSoundexMap().get(soundexMap[idx]); if (soundexSet != null) { charSet.clear(); charSet.addAll(soundexSet); charSet.remove(charSet.indexOf(Character.toUpperCase(charToReplace))); if (!charSet.isEmpty()) { Character[] charArray = charSet.toArray(new Character[charSet.size()]); Character newChar = charArray[random.nextInt(charArray.length)]; if (Character.isLowerCase(charToReplace)) { newChar = Character.toLowerCase(newChar); } sb.setCharAt(pos, newChar); } } } } } break; default: return str; } return sb.toString(); } /** * Generate duplicates with modification. * * @param obj the value to be duplicated * @param function {@link Function} * @param modifCount the times of modification * @param synonymIndexPath the path of the lucene index being used to generate a similar value. * @return */ public Object generateDuplicate(Object obj, Function function, int modifCount, String extraParameter) { if (function == Function.SET_TO_BLANK) { return EMPTY_STRING; } else if (function == Function.SET_TO_NULL) { return null; } String originalStr = (obj == null) ? EMPTY_STRING : String.valueOf(obj); return generateDuplicateString(originalStr, function, modifCount, extraParameter); } /** * Getter for random. * * @return the random */ public Random getRandom() { return this.random; } }