/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans.transform;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.inject.Inject;
import javax.inject.Named;
import org.apache.metamodel.util.HasName;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.OutputColumns;
import org.datacleaner.api.Provided;
import org.datacleaner.api.Transformer;
import org.datacleaner.api.Validate;
import org.datacleaner.components.categories.TextCategory;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.reference.Dictionary;
import org.datacleaner.reference.DictionaryConnection;
import org.datacleaner.reference.ReferenceDataCatalog;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
/**
 * Transformer that changes the text case (capitalization) of a single String
 * column.
 *
 * <p>
 * The behavior is selected with {@link TransformationMode}. In
 * {@link TransformationMode#CAPITALIZE_WORDS} mode the result can be refined
 * with four optional groups of dictionaries whose entries carry the exact
 * casing to apply:
 * </p>
 * <ul>
 * <li>complete value dictionaries: the whole input is replaced by an entry
 * that equals it, ignoring case;</li>
 * <li>word dictionaries: a single word is replaced by an entry that equals
 * it, ignoring case;</li>
 * <li>word start dictionaries: the beginning of a word is replaced by an entry
 * that the word starts with, ignoring case;</li>
 * <li>word end dictionaries: the ending of a word is replaced by an entry that
 * the word ends with, ignoring case.</li>
 * </ul>
 *
 * <p>
 * NOTE(review): the dictionary connections opened in {@link #init()} are not
 * closed anywhere in this class - confirm whether a close lifecycle method
 * should be added.
 * </p>
 */
@Named("Text case transformer")
@Description("Modifies the text case/capitalization of Strings.")
@Categorized(TextCategory.class)
public class TextCaseTransformer implements Transformer {

    /**
     * Enum depicting the modes of operation for the text case modifications.
     */
    public enum TransformationMode implements HasName {

        LOWER_CASE("Lower case"),

        UPPER_CASE("Upper case"),

        CAPITALIZE_SENTENCES("Capitalize sentences"),

        CAPITALIZE_WORDS("Capitalize every word");

        private final String _name;

        TransformationMode(final String name) {
            _name = name;
        }

        @Override
        public String getName() {
            return _name;
        }
    }

    public static final String VALUE_PROPERTY = "Value";
    public static final String MODE_PROPERTY = "Mode";
    public static final String ALL_WORDS_DICTIONARY_PROPERTY = "Dictionaries for casing complete value";
    public static final String WORD_DICTIONARY_PROPERTY = "Dictionaries for casing individual words";
    public static final String BEGIN_WORD_DICTIONARY_PROPERTY = "Dictionaries for casing beginning of words";
    public static final String END_WORD_DICTIONARY_PROPERTY = "Dictionaries for casing ending of words";

    /** The column whose values are transformed. */
    @Configured(VALUE_PROPERTY)
    InputColumn<String> valueColumn;

    /** The kind of case modification to apply; upper case unless configured otherwise. */
    @Configured(MODE_PROPERTY)
    TransformationMode mode = TransformationMode.UPPER_CASE;

    /** Dictionaries matched against the complete value (word capitalization mode only). */
    @Configured(value = ALL_WORDS_DICTIONARY_PROPERTY, required = false, order = 11)
    Dictionary[] allWordsDictionaries = {};

    /** Dictionaries matched against each individual word (word capitalization mode only). */
    @Configured(value = WORD_DICTIONARY_PROPERTY, required = false, order = 12)
    Dictionary[] wordDictionaries = {};

    /** Dictionaries matched against the beginning of each word (word capitalization mode only). */
    @Configured(value = BEGIN_WORD_DICTIONARY_PROPERTY, required = false, order = 13)
    Dictionary[] wordStartDictionaries = {};

    /** Dictionaries matched against the ending of each word (word capitalization mode only). */
    @Configured(value = END_WORD_DICTIONARY_PROPERTY, required = false, order = 14)
    Dictionary[] wordEndDictionaries = {};

    /** Configuration handed to {@link Dictionary#openConnection} when opening connections. */
    @Provided
    DataCleanerConfiguration _configuration;

    // Connections corresponding to the four dictionary groups; populated by init().
    private DictionaryConnection[] allWordsDictionaryConnections = {};
    private DictionaryConnection[] wordDictionaryConnections = {};
    private DictionaryConnection[] wordStartDictionaryConnections = {};
    private DictionaryConnection[] wordEndDictionaryConnections = {};

    /**
     * Streams the given dictionaries. The dictionary properties are optional,
     * so a {@code null} array is treated the same as an empty one instead of
     * failing with a {@link NullPointerException}.
     *
     * @param dictionaries
     *            the configured dictionaries, possibly {@code null}
     * @return a stream over the dictionaries, empty if there are none
     */
    private static Stream<Dictionary> streamOf(final Dictionary[] dictionaries) {
        if (dictionaries == null) {
            return Stream.<Dictionary> empty();
        }
        return Stream.of(dictionaries);
    }

    /**
     * Opens one connection per dictionary, in the order of the given array.
     *
     * @param dictionaries
     *            the dictionaries to connect to, possibly {@code null}
     * @return the opened connections; an empty array if there are no
     *         dictionaries
     */
    private DictionaryConnection[] openConnections(final Dictionary[] dictionaries) {
        return streamOf(dictionaries).map(d -> d.openConnection(_configuration))
                .toArray(DictionaryConnection[]::new);
    }

    /**
     * Opens the connections to all configured dictionaries.
     */
    @Initialize
    public void init() {
        allWordsDictionaryConnections = openConnections(allWordsDictionaries);
        wordDictionaryConnections = openConnections(wordDictionaries);
        wordStartDictionaryConnections = openConnections(wordStartDictionaries);
        wordEndDictionaryConnections = openConnections(wordEndDictionaries);
    }

    /**
     * Verifies that every configured dictionary is case sensitive.
     *
     * @throws IllegalStateException
     *             if any configured dictionary is not case sensitive
     */
    @Validate
    public void validate() {
        validateDictionaries(allWordsDictionaries);
        validateDictionaries(wordDictionaries);
        validateDictionaries(wordStartDictionaries);
        validateDictionaries(wordEndDictionaries);
    }

    /**
     * Checks a single group of dictionaries for case sensitivity.
     *
     * @param dictionaries
     *            the dictionaries to check, possibly {@code null}
     * @throws IllegalStateException
     *             if any of the dictionaries is not case sensitive
     */
    private void validateDictionaries(final Dictionary[] dictionaries) {
        if (!streamOf(dictionaries).allMatch(Dictionary::isCaseSensitive)) {
            throw new IllegalStateException("Dictionaries must be case sensitive");
        }
    }

    @Override
    public OutputColumns getOutputColumns() {
        // The single output column is named after the input column and the selected mode.
        return new OutputColumns(String.class, valueColumn.getName() + " (" + mode.getName() + ")");
    }

    @Override
    public String[] transform(final InputRow row) {
        final String value = row.getValue(valueColumn);
        final String[] result = new String[1];
        result[0] = transform(value);
        return result;
    }

    /**
     * Applies the configured {@link TransformationMode} to a single value.
     *
     * @param value
     *            the text to transform, may be {@code null}
     * @return the transformed text, or {@code null} if {@code value} is
     *         {@code null}
     * @throws UnsupportedOperationException
     *             if the configured mode is not handled
     */
    public String transform(final String value) {
        if (value == null) {
            return null;
        }

        switch (mode) {
        case UPPER_CASE:
            return UCharacter.toUpperCase(value);
        case LOWER_CASE:
            return UCharacter.toLowerCase(value);
        case CAPITALIZE_SENTENCES:
            return UCharacter.toTitleCase(value, BreakIterator.getSentenceInstance());
        case CAPITALIZE_WORDS:
            return capitalizeWordsByDictionaries(value);
        default:
            throw new UnsupportedOperationException("Unsupported mode: " + mode);
        }
    }

    /**
     * Capitalizes every word of the value, giving precedence to the
     * dictionaries.
     *
     * <p>
     * If a complete value dictionary holds an entry equal to the value
     * (ignoring case), that entry is returned as is. Otherwise the value is
     * title-cased on word boundaries, split into segments, and each segment
     * is passed through {@link #capitalizeWordByDictionaries(String)}.
     * </p>
     *
     * @param value
     *            the non-null text to capitalize
     * @return the capitalized text
     */
    private String capitalizeWordsByDictionaries(final String value) {
        for (final DictionaryConnection allWordsDictionaryConnection : allWordsDictionaryConnections) {
            final Iterator<String> lengthSortedValues = allWordsDictionaryConnection.getLengthSortedValues();
            while (lengthSortedValues.hasNext()) {
                final String candidate = lengthSortedValues.next();
                if (candidate.equalsIgnoreCase(value)) {
                    return candidate;
                }
            }
        }

        // Title-casing is only needed when no complete-value entry matched.
        final String preparedString = UCharacter.toTitleCase(value, BreakIterator.getWordInstance());

        // The segments cover the prepared string without gaps, so joining them
        // without a separator rebuilds the full text.
        return getAllWords(preparedString).stream().map(this::capitalizeWordByDictionaries)
                .collect(Collectors.joining());
    }

    /**
     * Applies the word-level dictionaries to a single segment.
     *
     * @param input
     *            a segment of the title-cased text
     * @return the first word dictionary entry that equals the segment ignoring
     *         case; if there is none, the segment with its beginning and then
     *         its ending replaced by matching dictionary entries (each only if
     *         a match exists)
     */
    private String capitalizeWordByDictionaries(final String input) {
        final Stream<String> wordStream =
                Arrays.stream(wordDictionaryConnections).flatMap(DictionaryConnection::stream);

        return wordStream.filter(input::equalsIgnoreCase).findFirst().orElseGet(() -> {
            final String startReplaced = replaceBeginning(input).orElse(input);
            return replaceEnd(startReplaced).orElse(startReplaced);
        });
    }

    /**
     * Replaces the beginning of the input with the first word start dictionary
     * entry that is shorter than the input and that the input starts with,
     * ignoring case.
     *
     * @param input
     *            the word to inspect
     * @return the input with its beginning replaced by the matching entry, or
     *         an empty {@link Optional} if no entry matches
     */
    private Optional<String> replaceBeginning(final String input) {
        final Stream<String> wordStartStream =
                Arrays.stream(wordStartDictionaryConnections).flatMap(DictionaryConnection::stream);

        // regionMatches(true, ...) compares per character like equalsIgnoreCase:
        // it does not depend on the default locale and cannot shift the offsets
        // used by the substring below, unlike comparing lower-cased copies.
        return wordStartStream.filter(c -> input.length() > c.length())
                .filter(c -> input.regionMatches(true, 0, c, 0, c.length()))
                .map(c -> c.concat(input.substring(c.length()))).findFirst();
    }

    /**
     * Replaces the ending of the input with the first word end dictionary
     * entry that is shorter than the input and that the input ends with,
     * ignoring case.
     *
     * @param input
     *            the word to inspect
     * @return the input with its ending replaced by the matching entry, or an
     *         empty {@link Optional} if no entry matches
     */
    private Optional<String> replaceEnd(final String input) {
        final Stream<String> wordEndStream =
                Arrays.stream(wordEndDictionaryConnections).flatMap(DictionaryConnection::stream);

        // Same locale-independent, per-character comparison as in replaceBeginning.
        return wordEndStream.filter(c -> input.length() > c.length())
                .filter(c -> input.regionMatches(true, input.length() - c.length(), c, 0, c.length()))
                .map(c -> input.substring(0, input.length() - c.length()).concat(c)).findFirst();
    }

    /**
     * Splits the text at the boundaries reported by a word
     * {@link BreakIterator}.
     *
     * @param preparedString
     *            the text to split
     * @return the consecutive segments between boundaries, in order; together
     *         they cover the whole text
     */
    private List<String> getAllWords(final String preparedString) {
        final List<String> words = new ArrayList<>();
        final BreakIterator breakIterator = BreakIterator.getWordInstance();
        breakIterator.setText(preparedString);

        int start = breakIterator.first();
        for (int end = breakIterator.next(); end != BreakIterator.DONE; start = end, end = breakIterator.next()) {
            words.add(preparedString.substring(start, end));
        }
        return words;
    }
}