/** * DataCleaner (community edition) * Copyright (C) 2014 Neopost - Customer Information Management * * This copyrighted material is made available to anyone wishing to use, modify, * copy, or redistribute it subject to the terms and conditions of the GNU * Lesser General Public License, as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this distribution; if not, write to: * Free Software Foundation, Inc. * 51 Franklin Street, Fifth Floor * Boston, MA 02110-1301 USA */ package org.datacleaner.connection; import java.io.File; import java.io.IOException; import java.io.ObjectInputStream; import java.lang.reflect.Field; import java.util.Arrays; import java.util.List; import org.apache.metamodel.DataContext; import org.apache.metamodel.DataContextFactory; import org.apache.metamodel.fixedwidth.EbcdicConfiguration; import org.apache.metamodel.fixedwidth.FixedWidthConfiguration; import org.apache.metamodel.util.FileResource; import org.apache.metamodel.util.Resource; import org.apache.metamodel.util.SerializableRef; import org.datacleaner.util.ReadObjectBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Datastore based on fixed width files */ public class FixedWidthDatastore extends UsageAwareDatastore<DataContext> implements FileDatastore, ResourceDatastore { public static final String EBCDIC_PREFIX = "IBM0"; private static final long serialVersionUID = 1L; private static Logger logger = LoggerFactory.getLogger(FixedWidthDatastore.class); private final String _filename; private final String _encoding; private final int _fixedValueWidth; private final int[] _valueWidths; private final boolean _failOnInconsistencies; private final boolean _skipEbcdicHeader; private final boolean _eolPresent; private final int _headerLineNumber; private final List<String> _customColumnNames; private final SerializableRef<Resource> _resourceRef; public FixedWidthDatastore(final String name, final String filename, final String encoding, final int fixedValueWidth) { this(name, filename, encoding, fixedValueWidth, true, false, true); } public FixedWidthDatastore(final String name, final String filename, final String encoding, final int[] valueWidths) { this(name, filename, encoding, valueWidths, true, false, true); } public FixedWidthDatastore(final String name, final String filename, final String encoding, final int fixedValueWidth, final boolean failOnInconsistencies, final boolean skipEbcdicHeader, final boolean eolPresent) { this(name, filename, encoding, fixedValueWidth, failOnInconsistencies, skipEbcdicHeader, eolPresent, FixedWidthConfiguration.DEFAULT_COLUMN_NAME_LINE); } public FixedWidthDatastore(final String name, final String filename, final String encoding, final int[] valueWidths, final boolean failOnInconsistencies, final boolean skipEbcdicHeader, final boolean eolPresent) { this(name, filename, encoding, valueWidths, failOnInconsistencies, skipEbcdicHeader, eolPresent, FixedWidthConfiguration.DEFAULT_COLUMN_NAME_LINE); } public FixedWidthDatastore(final String name, final String filename, final String encoding, final int fixedValueWidth, final boolean failOnInconsistencies, final boolean skipEbcdicHeader, final boolean eolPresent, final int headerLineNumber) { this(name, null, filename, encoding, fixedValueWidth, failOnInconsistencies, skipEbcdicHeader, eolPresent, headerLineNumber); } public FixedWidthDatastore(final String name, Resource resource, final String filename, final String encoding, final int fixedValueWidth, final boolean failOnInconsistencies, final boolean skipEbcdicHeader, final boolean eolPresent, final int headerLineNumber) { super(name); _filename = filename; if (resource == null) { resource = new FileResource(filename); } _resourceRef = new SerializableRef<>(resource); _encoding = encoding; _fixedValueWidth = fixedValueWidth; _valueWidths = new int[0]; _failOnInconsistencies = failOnInconsistencies; _skipEbcdicHeader = skipEbcdicHeader; _eolPresent = eolPresent; _headerLineNumber = headerLineNumber; _customColumnNames = null; } public FixedWidthDatastore(final String name, final String filename, final String encoding, final int[] valueWidths, final boolean failOnInconsistencies, final boolean skipEbcdicHeader, final boolean eolPresent, final int headerLineNumber) { this(name, filename, encoding, valueWidths, failOnInconsistencies, skipEbcdicHeader, eolPresent, headerLineNumber, null); } public FixedWidthDatastore(final String name, final String filename, final String encoding, final int[] valueWidths, final boolean failOnInconsistencies, final boolean skipEbcdicHeader, final boolean eolPresent, final int headerLineNumber, final List<String> customColumnNames) { this(name, null, filename, encoding, valueWidths, failOnInconsistencies, skipEbcdicHeader, eolPresent, headerLineNumber, null); } public FixedWidthDatastore(final String name, Resource resource, final String filename, final String encoding, final int[] valueWidths, final boolean failOnInconsistencies, final boolean skipEbcdicHeader, final boolean eolPresent, final int headerLineNumber, final List<String> customColumnNames) { super(name); _filename = filename; if (resource == null) { resource = new FileResource(filename); } _resourceRef = new SerializableRef<>(resource); _encoding = encoding; _fixedValueWidth = -1; _valueWidths = valueWidths; _failOnInconsistencies = failOnInconsistencies; _skipEbcdicHeader = skipEbcdicHeader; _eolPresent = eolPresent; _headerLineNumber = headerLineNumber; _customColumnNames = customColumnNames; } private void readObject(final ObjectInputStream stream) throws IOException, ClassNotFoundException { final ReadObjectBuilder.Adaptor adaptor = (getField, serializable) -> { final String filename = (String) getField.get("_filename", ""); final Field field = FixedWidthDatastore.class.getDeclaredField("_resourceRef"); field.setAccessible(true); final FileResource fileResource = new FileResource(filename); final SerializableRef<Resource> resourceRef = new SerializableRef<>(fileResource); field.set(serializable, resourceRef); }; ReadObjectBuilder.create(this, FixedWidthDatastore.class).readObject(stream, adaptor); } @Override public PerformanceCharacteristics getPerformanceCharacteristics() { return new PerformanceCharacteristicsImpl(false, true); } @Override protected UsageAwareDatastoreConnection<DataContext> createDatastoreConnection() { final FixedWidthConfiguration configuration = getConfiguration(); final Resource resource = _resourceRef.get(); final DataContext dataContext; if (resource == null) { logger.warn("Resource was not available, a local file reference will be created with path: {}", _filename); dataContext = DataContextFactory.createFixedWidthDataContext(new File(_filename), configuration); } else { dataContext = DataContextFactory.createFixedWidthDataContext(resource, configuration); } return new DatastoreConnectionImpl<>(dataContext, this); } public FixedWidthConfiguration getConfiguration() { final FixedWidthConfiguration configuration; if (isEbcdic()) { if (_fixedValueWidth == -1) { configuration = new EbcdicConfiguration(_headerLineNumber, _encoding, _valueWidths, _failOnInconsistencies, _skipEbcdicHeader, _eolPresent); } else { configuration = new EbcdicConfiguration(_headerLineNumber, _encoding, _fixedValueWidth, _failOnInconsistencies, _skipEbcdicHeader, _eolPresent); } } else { if (_fixedValueWidth == -1) { configuration = new FixedWidthConfiguration(_headerLineNumber, _encoding, _valueWidths, _failOnInconsistencies); } else { configuration = new FixedWidthConfiguration(_headerLineNumber, _encoding, _fixedValueWidth, _failOnInconsistencies); } } return configuration; } public String getEncoding() { return _encoding; } public int getFixedValueWidth() { return _fixedValueWidth; } public int[] getValueWidths() { return _valueWidths; } public int getHeaderLineNumber() { return _headerLineNumber; } @Override public String getFilename() { return _filename; } @Override public Resource getResource() { if (_resourceRef == null) { return null; } return _resourceRef.get(); } public boolean isFailOnInconsistencies() { return _failOnInconsistencies; } public boolean isSkipEbcdicHeader() { return _skipEbcdicHeader; } public boolean isEolPresent() { return _eolPresent; } public List<String> getCustomColumnNames() { return _customColumnNames; } @Override protected void decorateIdentity(final List<Object> identifiers) { super.decorateIdentity(identifiers); identifiers.add(_filename); identifiers.add(_encoding); identifiers.add(_fixedValueWidth); identifiers.add(_valueWidths); identifiers.add(_headerLineNumber); identifiers.add(_failOnInconsistencies); identifiers.add(_skipEbcdicHeader); identifiers.add(_eolPresent); } private boolean isEbcdic() { // This is just a way how to differentiate between EBCDIC and normal FixedWidth configuration. // Perhaps there is a better way how to do this. return getEncoding().startsWith(EBCDIC_PREFIX); } @Override public String toString() { return "FixedWidthDatastore[name=" + getName() + ", filename=" + _filename + ", encoding=" + _encoding + ", headerLineNumber=" + _headerLineNumber + ", valueWidths=" + Arrays.toString(_valueWidths) + ", fixedValueWidth=" + _fixedValueWidth + "]"; } }