/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans.filter;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;
import java.util.Date;
import java.util.Locale;
import java.util.Properties;

import javax.inject.Named;

import org.apache.metamodel.query.OperatorType;
import org.apache.metamodel.query.Query;
import org.apache.metamodel.schema.Column;
import org.apache.metamodel.schema.Table;
import org.apache.metamodel.util.Action;
import org.apache.metamodel.util.Resource;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Close;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.Distributed;
import org.datacleaner.api.FileProperty;
import org.datacleaner.api.FileProperty.FileAccessMode;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.Optimizeable;
import org.datacleaner.api.QueryOptimizedFilter;
import org.datacleaner.components.categories.DateAndTimeCategory;
import org.datacleaner.components.categories.FilterCategory;
import org.datacleaner.components.convert.ConvertToDateTransformer;
import org.datacleaner.components.convert.ConvertToNumberTransformer;
import org.datacleaner.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Strings;
/**
 * Filter for achieving a "change data capture" mechanism based on a
 * "last modified" field. After each execution, the greatest timestamp is
 * recorded and picked up successively by the next run.
 *
 * <p>
 * The state is persisted in a properties file. A value is stored either as
 * {@code <millis>} or, when a sub-second part was captured from a
 * {@link Timestamp}, as {@code <millis>.<nanos>} with the nanos zero-padded to
 * nine digits. The value {@code -1} is used internally as the "no value"
 * marker for the millisecond fields.
 */
@Named("Capture changed records")
@Description("Include only records that have changed since the last time you ran the job. "
        + "This filter assumes a field containing the timestamp or a number field of the latest change for each "
        + "record, and stores the greatest encountered value in order to update the filter's future state.")
@Distributed(false)
@Categorized({ FilterCategory.class, DateAndTimeCategory.class })
@Optimizeable(removeableUponOptimization = false)
public class CaptureChangedRecordsFilter implements QueryOptimizedFilter<ValidationCategory> {

    private static final Logger logger = LoggerFactory.getLogger(CaptureChangedRecordsFilter.class);

    @Configured
    @Description("Column containing the last modification timestamp or date or number.")
    InputColumn<Object> lastModifiedColumn;

    @Configured
    @Description("A file used to persist and load the latest state of this data capture component.")
    @FileProperty(extension = "properties", accessMode = FileAccessMode.SAVE)
    Resource captureStateFile;

    @Configured(required = false)
    @Description(
            "A custom identifier for this captured state. If omitted, the name of the 'Last modified column' will be used.")
    String captureStateIdentifier;

    // Threshold loaded from the state file; -1 means "no previous state".
    private long _lastModifiedThresholdMillis = -1L;
    private int _lastModifiedThresholdNanos = 0;

    // Greatest value seen during this run; -1 means "nothing seen yet".
    private long _greatestEncounteredMillis = -1L;
    private int _greatestEncounteredNanos = 0;

    /**
     * Loads the threshold of the previous run (if any) from the capture state
     * file.
     *
     * @throws IOException
     *             if the capture state file cannot be read
     * @throws IllegalStateException
     *             if the stored value cannot be converted to a number
     */
    @Initialize
    public void initialize() throws IOException {
        final Properties properties = loadProperties();
        final String key = getPropertyKey();
        final String lastModifiedStr = properties.getProperty(key);
        if (Strings.isNullOrEmpty(lastModifiedStr)) {
            return;
        }

        // Parse everything into locals first so that a malformed value never
        // leaves the threshold half-initialized.
        final long millis;
        final int nanos;
        final int indexOfDot = lastModifiedStr.indexOf('.');
        if (indexOfDot == -1) {
            millis = parseStateNumber(key, lastModifiedStr, lastModifiedStr).longValue();
            nanos = 0;
        } else {
            final String millisPart = lastModifiedStr.substring(0, indexOfDot);
            final String nanosPart = lastModifiedStr.substring(indexOfDot + 1);
            millis = parseStateNumber(key, lastModifiedStr, millisPart).longValue();
            nanos = parseStateNumber(key, lastModifiedStr, nanosPart).intValue();
        }
        _lastModifiedThresholdMillis = millis;
        _lastModifiedThresholdNanos = nanos;
    }

    /**
     * Converts a part of a persisted state value to a number, failing with a
     * descriptive message instead of a bare {@link NullPointerException} when
     * the conversion yields no result.
     *
     * @param key
     *            the property key the value was read from (for the message)
     * @param rawValue
     *            the complete property value (for the message)
     * @param numberPart
     *            the part of the value to convert
     * @return the converted number, never null
     */
    private Number parseStateNumber(final String key, final String rawValue, final String numberPart) {
        final Number number = convertToNumber(numberPart);
        if (number == null) {
            throw new IllegalStateException("Could not parse captured state value '" + rawValue + "' of property '"
                    + key + "' in capture state file: " + captureStateFile);
        }
        return number;
    }

    /**
     * {@inheritDoc}
     *
     * Only the {@link ValidationCategory#VALID} category can be pushed down
     * to the query.
     */
    @Override
    public boolean isOptimizable(final ValidationCategory category) {
        // only the valid category is optimizeable currently
        return category == ValidationCategory.VALID;
    }

    /**
     * Adds a "greater than the stored threshold" condition on the last
     * modified column to the query. The query is left unchanged when there is
     * no stored threshold.
     *
     * @param q
     *            the query to add the condition to
     * @param category
     *            the category to optimize for, expected to be
     *            {@link ValidationCategory#VALID}
     * @return the same query instance
     */
    @Override
    public Query optimizeQuery(final Query q, final ValidationCategory category) {
        assert category == ValidationCategory.VALID;

        if (_lastModifiedThresholdMillis != -1L) {
            final Column column = lastModifiedColumn.getPhysicalColumn();
            if (column.getType().isTimeBased()) {
                q.where(column, OperatorType.GREATER_THAN, createQueryOperand());
            } else {
                q.where(column, OperatorType.GREATER_THAN, _lastModifiedThresholdMillis);
            }
        }
        return q;
    }

    /**
     * Creates the operand for a time based column: a plain {@link Date} when
     * no nanos were captured, otherwise a {@link Timestamp} carrying them.
     */
    private Date createQueryOperand() {
        if (_lastModifiedThresholdNanos == 0) {
            return new Date(_lastModifiedThresholdMillis);
        }
        final Timestamp ts = new Timestamp(_lastModifiedThresholdMillis);
        ts.setNanos(_lastModifiedThresholdNanos);
        return ts;
    }

    /**
     * Persists the greatest value encountered during this run to the capture
     * state file. Nothing is written when no comparable value was seen.
     *
     * @throws IOException
     *             if the existing capture state file cannot be read
     */
    @Close(onFailure = false)
    public void close() throws IOException {
        if (_greatestEncounteredMillis == -1L) {
            return;
        }

        // Re-load the file so that entries stored under other keys survive.
        final Properties properties = loadProperties();
        final String key = getPropertyKey();

        final String value;
        if (_greatestEncounteredNanos == 0) {
            value = Long.toString(_greatestEncounteredMillis);
        } else {
            // Locale.ROOT: "%d" is otherwise localized and could emit
            // non-ASCII digits that cannot be read back.
            value = _greatestEncounteredMillis + "." + String.format(Locale.ROOT, "%09d", _greatestEncounteredNanos);
        }
        properties.setProperty(key, value);

        captureStateFile.write(out -> properties.store(out, null));
    }

    /**
     * Gets the key to use in the capture state file. If there is not a
     * captureStateIdentifier available, we want to avoid using a hardcoded key,
     * since the same file may be used for multiple purposes, even multiple
     * filters of the same type. Of course this is not desired configuration,
     * but may be more convenient for lazy users!
     *
     * @return the trimmed captureStateIdentifier if configured, otherwise the
     *         column name (prefixed with the table name when the column is a
     *         physical column of a named table), in all cases suffixed with
     *         ".GreatestLastModifiedValue"
     */
    private String getPropertyKey() {
        if (StringUtils.isNullOrEmpty(captureStateIdentifier)) {
            if (lastModifiedColumn.isPhysicalColumn()) {
                final Table table = lastModifiedColumn.getPhysicalColumn().getTable();
                if (table != null && !StringUtils.isNullOrEmpty(table.getName())) {
                    return table.getName() + "." + lastModifiedColumn.getName() + ".GreatestLastModifiedValue";
                }
            }
            return lastModifiedColumn.getName() + ".GreatestLastModifiedValue";
        }
        return captureStateIdentifier.trim() + ".GreatestLastModifiedValue";
    }

    /**
     * Reads the capture state file into a {@link Properties} object.
     *
     * @return the loaded properties, or empty properties when the file does
     *         not exist
     */
    private Properties loadProperties() throws IOException {
        final Properties properties = new Properties();
        if (!captureStateFile.isExists()) {
            logger.info("Capture state file does not exist: {}", captureStateFile);
            return properties;
        }
        captureStateFile.read((Action<InputStream>) properties::load);
        return properties;
    }

    /**
     * Categorizes a row as {@link ValidationCategory#VALID} when its last
     * modified value is strictly greater than the stored threshold (or when
     * there is no threshold), and as {@link ValidationCategory#INVALID}
     * otherwise. As a side effect the greatest value encountered is tracked
     * so that it can be persisted by {@link #close()}.
     *
     * @param inputRow
     *            the row to categorize
     * @return the category of the row
     */
    @Override
    public ValidationCategory categorize(final InputRow inputRow) {
        final Object lastModified = inputRow.getValue(lastModifiedColumn);

        // Normalize the value to (millis, nanos); -1 millis means "not comparable".
        final long rowMillis;
        final int rowNanos;
        if (lastModified == null) {
            rowMillis = -1L;
            rowNanos = 0;
        } else if (lastModified instanceof Timestamp) {
            final Timestamp ts = (Timestamp) lastModified;
            rowMillis = ts.getTime();
            rowNanos = ts.getNanos();
        } else if (lastModified instanceof String) {
            final Date date = ConvertToDateTransformer.getInternalInstance().transformValue(lastModified);
            rowMillis = (date == null) ? -1L : date.getTime();
            rowNanos = 0;
        } else {
            final Number lastModifiedAsNumber = convertToNumber(lastModified);
            rowMillis = (lastModifiedAsNumber == null) ? -1L : lastModifiedAsNumber.longValue();
            rowNanos = 0;
        }

        if (rowMillis != -1L) {
            recordIfGreatest(rowMillis, rowNanos);
        }

        if (_lastModifiedThresholdMillis == -1L) {
            return ValidationCategory.VALID;
        }

        if (rowMillis == -1L) {
            logger.info("Value of {} was not comparable, returning INVALID category: {}", lastModifiedColumn.getName(),
                    inputRow);
            return ValidationCategory.INVALID;
        }

        return isAfterThreshold(rowMillis, rowNanos) ? ValidationCategory.VALID : ValidationCategory.INVALID;
    }

    /**
     * Updates the greatest encountered value if the given value exceeds it.
     */
    private void recordIfGreatest(final long rowMillis, final int rowNanos) {
        synchronized (this) {
            // The explicit -1 check is needed because -1 is the "unset" marker
            // and would otherwise compare greater than negative row values.
            final boolean greater = _greatestEncounteredMillis == -1L || _greatestEncounteredMillis < rowMillis
                    || (_greatestEncounteredMillis == rowMillis && _greatestEncounteredNanos < rowNanos);
            if (greater) {
                _greatestEncounteredMillis = rowMillis;
                _greatestEncounteredNanos = rowNanos;
            }
        }
    }

    /**
     * Determines if the given value is strictly greater than the stored
     * threshold. The nanos are used as a tie-breaker when the milliseconds
     * are equal, which keeps row-by-row filtering consistent with the operand
     * used by {@link #optimizeQuery(Query, ValidationCategory)}.
     */
    private boolean isAfterThreshold(final long rowMillis, final int rowNanos) {
        if (_lastModifiedThresholdMillis != rowMillis) {
            return _lastModifiedThresholdMillis < rowMillis;
        }
        return _lastModifiedThresholdNanos < rowNanos;
    }

    /**
     * Converts the given value to a number using
     * {@link ConvertToNumberTransformer}; the result may be null.
     */
    private Number convertToNumber(final Object lastModified) {
        return ConvertToNumberTransformer.transformValue(lastModified);
    }
}