/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.reference;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import org.apache.metamodel.util.FileHelper;
import org.apache.metamodel.util.Resource;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.util.ReadObjectBuilder;
import org.datacleaner.util.convert.ResourceConverter;
import au.com.bytecode.opencsv.CSVParser;
/**
* Synonym catalog based on a text file.
*
* Each line in the file should contain a master term with trailing
* comma-separated synonyms.
*
* Example:
*
* <pre>
* DK,Denmark,Danmark,Dänemark
* NL,Holland,The Netherlands
* FR,France
* </pre>
*/
public final class TextFileSynonymCatalog extends AbstractReferenceData implements SynonymCatalog {
private static final long serialVersionUID = 1L;
private final String _filename;
private final boolean _caseSensitive;
private final String _encoding;
public TextFileSynonymCatalog(final String name, final File file, final boolean caseSensitive,
final String encoding) {
this(name, file.getPath(), caseSensitive, encoding);
}
public TextFileSynonymCatalog(final String name, final String filename, final boolean caseSensitive,
final String encoding) {
super(name);
_filename = filename;
_caseSensitive = caseSensitive;
_encoding = encoding;
}
@Override
public SynonymCatalogConnection openConnection(final DataCleanerConfiguration configuration) {
final ResourceConverter rc = new ResourceConverter(configuration);
final Resource resource = rc.fromString(Resource.class, _filename);
final Map<String, String> synonyms = resource.read(in -> {
final Map<String, String> synonyms1 = new HashMap<>();
final CSVParser parser = new CSVParser(',', '"', '\\');
final BufferedReader reader = FileHelper.getBufferedReader(in, _encoding);
try {
for (String line = reader.readLine(); line != null; line = reader.readLine()) {
line = line.trim();
final String[] values;
try {
values = parser.parseLine(line);
} catch (final Exception e) {
throw new IllegalStateException("Failed to parse line: " + line, e);
}
if (values.length > 0) {
synonyms1.put(values[0], values[0]);
}
if (values.length > 1) {
for (int i = 1; i < values.length; i++) {
synonyms1.put(values[i], values[0]);
}
}
}
} catch (final IOException e) {
throw new IllegalStateException(e);
} finally {
FileHelper.safeClose(reader);
}
return synonyms1;
});
return new SimpleSynonymCatalog(getName(), synonyms, _caseSensitive).openConnection(configuration);
}
private void readObject(final ObjectInputStream stream) throws IOException, ClassNotFoundException {
ReadObjectBuilder.create(this, TextFileSynonymCatalog.class).readObject(stream);
}
@Override
public boolean equals(final Object obj) {
if (super.equals(obj)) {
final TextFileSynonymCatalog other = (TextFileSynonymCatalog) obj;
return Objects.equals(_filename, other._filename) && Objects.equals(_caseSensitive, other._caseSensitive)
&& Objects.equals(_encoding, other._encoding);
}
return false;
}
@Override
public String toString() {
return "TextFileSynonymCatalog[name=" + getName() + ", filename=" + _filename + ", caseSensitive="
+ _caseSensitive + ", encoding=" + _encoding + "]";
}
public String getEncoding() {
return _encoding;
}
public String getFilename() {
return _filename;
}
@Override
public boolean isCaseSensitive() {
return _caseSensitive;
}
}