/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.reference;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.util.ReadObjectBuilder;
import org.datacleaner.util.ReadObjectBuilder.Adaptor;
import org.datacleaner.util.StringUtils;
/**
* The simplest implementation of {@link SynonymCatalog}. Based on an in-memory
* {@link Map} of values.
*/
public final class SimpleSynonymCatalog extends AbstractReferenceData implements SynonymCatalog {
private static final long serialVersionUID = 1L;
private final Map<String, String> _synonymMap;
private final boolean _caseSensitive;
public SimpleSynonymCatalog(final String name) {
this(name, new HashMap<>());
}
public SimpleSynonymCatalog(final String name, final Map<String, String> synonyms) {
this(name, synonyms, true);
}
public SimpleSynonymCatalog(final String name, final Map<String, String> synonyms, final boolean caseSensitive) {
super(name);
_caseSensitive = caseSensitive;
_synonymMap = synonyms;
}
public SimpleSynonymCatalog(final String name, final Synonym... synonyms) {
this(name);
for (final Synonym synonym : synonyms) {
addSynonym(synonym);
}
}
public SimpleSynonymCatalog(final String name, final List<Synonym> synonyms) {
this(name);
for (final Synonym synonym : synonyms) {
addSynonym(synonym);
}
}
private Map<String, String> createSingleWordSynonymMap() {
if (_caseSensitive) {
// in the case-sensitive scenario we can simply reuse the normal
// synonym map
return _synonymMap;
}
final Map<String, String> synonymMap = new HashMap<>();
final Set<Entry<String, String>> entries = _synonymMap.entrySet();
for (final Entry<String, String> entry : entries) {
final String synonym = entry.getKey();
final String masterTerm = entry.getValue();
if (StringUtils.isSingleWord(synonym)) {
synonymMap.put(synonym.toLowerCase(), masterTerm);
}
}
return synonymMap;
}
private SortedMap<String, String> createMultiWordSynonymMap() {
final SortedMap<String, String> synonymMap =
new TreeMap<>(Comparator.comparingInt(String::length).reversed().thenComparing(String::compareTo));
final Set<Entry<String, String>> entries = _synonymMap.entrySet();
for (final Entry<String, String> entry : entries) {
final String synonym = entry.getKey();
final String masterTerm = entry.getValue();
if (!StringUtils.isSingleWord(synonym)) {
if (_caseSensitive) {
synonymMap.put(synonym, masterTerm);
} else {
synonymMap.put(synonym.toLowerCase(), masterTerm);
}
}
}
return synonymMap;
}
private void readObject(final ObjectInputStream stream) throws IOException, ClassNotFoundException {
final Adaptor adaptor = (getField, serializable) -> {
final boolean caseSensitive = getField.get("_caseSensitive", true);
final Field field = SimpleSynonymCatalog.class.getDeclaredField("_caseSensitive");
field.setAccessible(true);
field.set(serializable, caseSensitive);
};
ReadObjectBuilder.create(this, SimpleSynonymCatalog.class).readObject(stream, adaptor);
}
private void addSynonym(final Synonym synonym) {
final String masterTerm = synonym.getMasterTerm();
{
final String key = _caseSensitive ? masterTerm : masterTerm.toLowerCase();
_synonymMap.put(key, masterTerm);
}
final Collection<String> values = synonym.getSynonyms();
for (final String value : values) {
final String key = _caseSensitive ? value : value.toLowerCase();
_synonymMap.put(key, masterTerm);
}
}
@Override
public boolean equals(final Object obj) {
if (super.equals(obj)) {
final SimpleSynonymCatalog other = (SimpleSynonymCatalog) obj;
return Objects.equals(_synonymMap, other._synonymMap) && Objects
.equals(_caseSensitive, other._caseSensitive);
}
return false;
}
@Override
public SynonymCatalogConnection openConnection(final DataCleanerConfiguration configuration) {
return new SynonymCatalogConnection() {
private final SortedMap<String, String> _sortedMultiWordSynonymMap = createMultiWordSynonymMap();
private final Map<String, String> _singleWordSynonymMap = createSingleWordSynonymMap();
@Override
public Collection<Synonym> getSynonyms() {
final Map<String, Synonym> synonyms = new TreeMap<>();
for (final Entry<String, String> synonymEntry : _synonymMap.entrySet()) {
final String masterTerm = synonymEntry.getValue();
final String synonymValue = synonymEntry.getKey();
MutableSynonym synonym = (MutableSynonym) synonyms.get(masterTerm);
if (synonym == null) {
synonym = new MutableSynonym(masterTerm);
synonyms.put(masterTerm, synonym);
}
synonym.addSynonym(synonymValue);
}
return synonyms.values();
}
@Override
public String getMasterTerm(final String term) {
if (term == null) {
return null;
}
final String key = _caseSensitive ? term : term.toLowerCase();
return _singleWordSynonymMap.get(key);
}
@Override
public Replacement replaceInline(String sentence) {
final List<String> synonyms = new ArrayList<>();
final List<String> masterTerms = new ArrayList<>();
// matchString will contain a copy of "sentence" but potentially
// lower-cased for case-insensitive matching
String matchString;
if (!_caseSensitive) {
matchString = sentence.toLowerCase();
} else {
matchString = sentence;
}
final Set<Entry<String, String>> entries = _sortedMultiWordSynonymMap.entrySet();
for (final Entry<String, String> entry : entries) {
final String synonym = entry.getKey();
final String masterTerm = entry.getValue();
final Matcher matcher = Pattern.compile("\\b" + synonym + "\\b").matcher(matchString);
while (matcher.find()) {
sentence =
sentence.substring(0, matcher.start()) + masterTerm + sentence.substring(matcher.end());
if (_caseSensitive) {
matchString = sentence.toLowerCase();
} else {
matchString = sentence;
}
synonyms.add(synonym);
masterTerms.add(masterTerm);
}
}
final StringBuilder sb = new StringBuilder();
final List<String> tokens = StringUtils.splitOnWordBoundaries(sentence, true);
for (final String token : tokens) {
if (StringUtils.isSingleWord(token)) {
final String masterTerm = getMasterTerm(token);
if (masterTerm == null) {
// no match, just add it
sb.append(token);
} else {
// match - add the master term
if (!masterTerm.equals(token)) {
synonyms.add(token);
masterTerms.add(masterTerm);
}
sb.append(masterTerm);
}
} else {
// it's a delim, just add it
sb.append(token);
}
}
final String finalSentence = sb.toString();
return new Replacement() {
@Override
public String getReplacedString() {
return finalSentence;
}
@Override
public List<String> getSynonyms() {
return synonyms;
}
@Override
public List<String> getMasterTerms() {
return masterTerms;
}
};
}
@Override
public void close() {
}
};
}
@Override
public boolean isCaseSensitive() {
return _caseSensitive;
}
public Map<String, String> getSynonymMap() {
return _synonymMap;
}
}