/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.beans.transform; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.inject.Named; import org.datacleaner.api.Categorized; import org.datacleaner.api.Configured; import org.datacleaner.api.Description; import org.datacleaner.api.InputColumn; import org.datacleaner.api.InputRow; import org.datacleaner.api.OutputColumns; import org.datacleaner.api.Transformer; import org.datacleaner.components.categories.TextCategory; import org.datacleaner.components.convert.ConvertToStringTransformer; @Named("Remove substring") @Description( "Subtracts one or more substrings from a base text, i.e. [\"Hello world\",\"World\"] would yield \"Hello\".") @Categorized(TextCategory.class) public class RemoveSubstringTransformer implements Transformer { @Configured("Base text column") @Description("Column containing the text to subtract from") InputColumn<String> baseColumn; @Configured("Substring columns") @Description("Columns containing the substrings to remove from the base text") InputColumn<?>[] substringColumns; @Configured(value = "Match whole words only", required = false) @Description("If set, only whole words (surrounded by whitespace or punctuation) will be removed.\n" + " This prevents removing partial words.") boolean wholeWordsOnly = false; @Configured @Description("Should substring matching be case-sensitive or not?") boolean caseSensitive = true; @Override public OutputColumns getOutputColumns() { return new OutputColumns(String.class, baseColumn.getName() + " (substring removed)"); } @Override public String[] transform(final InputRow inputRow) { String subtractedString = inputRow.getValue(baseColumn); for (final InputColumn<?> inputColumn : substringColumns) { final Object value = inputRow.getValue(inputColumn); if (value instanceof List) { for (final Object element : (List<?>) value) { subtractedString = subtract(subtractedString, element); } } else { subtractedString = subtract(subtractedString, value); } } return new String[] { subtractedString }; } private String subtract(final String subtractedString, final Object element) { if (element == null || subtractedString == null) { return subtractedString; } final String substring = (caseSensitive ? ConvertToStringTransformer.transformValue(element) : ConvertToStringTransformer.transformValue(element).toLowerCase()); String resultingString = subtractedString; if (caseSensitive && !wholeWordsOnly) { // special case where we can do a very easy/effective // String.replace(..) operation return resultingString.replace(substring, ""); } String matchedString = (caseSensitive ? resultingString : resultingString.toLowerCase()); final Pattern substringPattern; if (wholeWordsOnly) { substringPattern = Pattern.compile("\\b" + Pattern.quote(substring) + "\\b"); } else { substringPattern = Pattern.compile(Pattern.quote(substring)); } Matcher matcher = substringPattern.matcher(matchedString); while (matcher.find()) { final int start = matcher.start(); final int end = matcher.end(); resultingString = resultingString.substring(0, start) + resultingString.substring(end); matchedString = (caseSensitive ? resultingString : resultingString.toLowerCase()); matcher = substringPattern.matcher(matchedString); } return resultingString; } }