/* * ARX: Powerful Data Anonymization * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.deidentifier.arx; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Set; import org.apache.commons.math3.util.Pair; import org.deidentifier.arx.ARXLattice.ARXNode; import org.deidentifier.arx.DataHandleInternal.InterruptHandler; import org.deidentifier.arx.DataType.ARXDate; import org.deidentifier.arx.DataType.ARXDecimal; import org.deidentifier.arx.DataType.ARXInteger; import org.deidentifier.arx.DataType.DataTypeDescription; import org.deidentifier.arx.aggregates.StatisticsBuilder; import org.deidentifier.arx.certificate.elements.ElementData; import org.deidentifier.arx.io.CSVDataOutput; import org.deidentifier.arx.io.CSVSyntax; import org.deidentifier.arx.risk.RiskEstimateBuilder; import org.deidentifier.arx.risk.RiskModelHistogram; import cern.colt.Swapper; /** * This class provides access to dictionary encoded data. Furthermore, the data * is linked to the associated input or output data. This means that, e.g., if * the input data is sorted, the output data will be sorted accordingly. This * ensures that original tuples and their generalized counterpart will always * have the same row index, which is important for many use cases, e.g., for * graphical tools that allow to compare the original dataset to generalized * versions. * * @author Fabian Prasser * @author Florian Kohlmayer */ public abstract class DataHandle { /** The data types. */ protected DataType<?>[][] dataTypes = null; /** The data definition. */ protected DataDefinition definition = null; /** The header. */ protected String[] header = null; /** The node. */ protected ARXNode node = null; /** The current registry. */ protected DataRegistry registry = null; /** The current research subset. */ protected DataHandle subset = null; /** * Returns the name of the specified column. * * @param col The column index * @return the attribute name */ public abstract String getAttributeName(int col); /** * Returns the index of the given attribute, -1 if it is not in the header. * * @param attribute the attribute * @return the column index of */ public int getColumnIndexOf(final String attribute) { checkRegistry(); for (int i = 0; i < header.length; i++) { if (header[i].equals(attribute)) { return i; } } return -1; } /** * Returns the according data type. * * @param attribute the attribute * @return the data type */ public DataType<?> getDataType(final String attribute) { checkRegistry(); return definition.getDataType(attribute); } /** * Returns a date/time value from the specified cell. * * @param row The cell's row index * @param col The cell's column index * @return the date * @throws ParseException the parse exception */ public Date getDate(int row, int col) throws ParseException { String value = getValue(row, col); DataType<?> type = getDataType(getAttributeName(col)); if (type instanceof ARXDate) { return ((ARXDate) type).parse(value); } else { throw new ParseException("Invalid datatype: " + type.getClass().getSimpleName(), col); } } /** * Returns the data definition. * * @return the definition */ public DataDefinition getDefinition() { checkRegistry(); return definition; } /** * Returns an array containing the distinct values in the given column. * * @param column The column to process * @return the distinct values */ public final String[] getDistinctValues(int column) { return getDistinctValues(column, false, new InterruptHandler() { @Override public void checkInterrupt() { // Nothing to do } }); } /** * Returns a double value from the specified cell. * * @param row The cell's row index * @param col The cell's column index * @return the double * @throws ParseException the parse exception */ public Double getDouble(int row, int col) throws ParseException { String value = getValue(row, col); DataType<?> type = getDataType(getAttributeName(col)); if (type instanceof ARXDecimal) { return ((ARXDecimal) type).parse(value); } else if (type instanceof ARXInteger) { Long _long = ((ARXInteger) type).parse(value); return _long == null ? null : _long.doubleValue(); } else { throw new ParseException("Invalid datatype: " + type.getClass().getSimpleName(), col); } } /** * Returns a float value from the specified cell. * * @param row The cell's row index * @param col The cell's column index * @return the float * @throws ParseException the parse exception */ public Float getFloat(int row, int col) throws ParseException { String value = getValue(row, col); DataType<?> type = getDataType(getAttributeName(col)); if (type instanceof ARXDecimal) { Double _double = ((ARXDecimal) type).parse(value); return _double == null ? null : _double.floatValue(); } else if (type instanceof ARXInteger) { Long _long = ((ARXInteger) type).parse(value); return _long == null ? null : _long.floatValue(); } else { throw new ParseException("Invalid datatype: " + type.getClass().getSimpleName(), col); } } /** * Returns the generalization level for the attribute. * * @param attribute the attribute * @return the generalization */ public abstract int getGeneralization(String attribute); /** * Returns an int value from the specified cell. * * @param row The cell's row index * @param col The cell's column index * @return the int * @throws ParseException the parse exception */ public Integer getInt(int row, int col) throws ParseException { String value = getValue(row, col); DataType<?> type = getDataType(getAttributeName(col)); if (type instanceof ARXInteger) { Long _long = ((ARXInteger) type).parse(value); return _long == null ? null : _long.intValue(); } else { throw new ParseException("Invalid datatype: " + type.getClass().getSimpleName(), col); } } /** * Returns a long value from the specified cell. * * @param row The cell's row index * @param col The cell's column index * @return the long * @throws ParseException the parse exception */ public Long getLong(int row, int col) throws ParseException { String value = getValue(row, col); DataType<?> type = getDataType(getAttributeName(col)); if (type instanceof ARXInteger) { return ((ARXInteger) type).parse(value); } else { throw new ParseException("Invalid datatype: " + type.getClass().getSimpleName(), col); } } /** * Returns a mapping from data types to the relative number of values that conform to the according type. * This method uses the default locale. * This method only returns types that match at least 80% of all values in the column . * * @param column the column * @return the matching data types */ public List<Pair<DataType<?>, Double>> getMatchingDataTypes(int column) { return getMatchingDataTypes(column, Locale.getDefault(), 0.8d); } /** * Returns a mapping from data types to the relative number of values that conform to the according type for a given wrapped class. * This method uses the default locale. * This method only returns types that match at least 80% of all values in the column . * * @param <U> the generic type * @param column the column * @param clazz The wrapped class * @return the matching data types */ public <U> List<Pair<DataType<?>, Double>> getMatchingDataTypes(int column, Class<U> clazz) { return getMatchingDataTypes(column, clazz, Locale.getDefault(), 0.8d); } /** * Returns a mapping from data types to the relative number of values that conform to the according type for a given wrapped class. * This method uses the default locale. * * @param <U> the generic type * @param column the column * @param clazz The wrapped class * @param threshold Relative minimal number of values that must match to include a data type in the results * @return the matching data types */ public <U> List<Pair<DataType<?>, Double>> getMatchingDataTypes(int column, Class<U> clazz, double threshold) { return getMatchingDataTypes(column, clazz, Locale.getDefault(), threshold); } /** * Returns a mapping from data types to the relative number of values that conform to the according type for a given wrapped class. * This method only returns types that match at least 80% of all values in the column . * * @param <U> the generic type * @param column the column * @param clazz The wrapped class * @param locale The locale to use * @return the matching data types */ public <U> List<Pair<DataType<?>, Double>> getMatchingDataTypes(int column, Class<U> clazz, Locale locale) { return getMatchingDataTypes(column, clazz, locale, 0.8d); } /** * Returns a mapping from data types to the relative number of values that conform to the according type for a given wrapped class. * * @param <U> the generic type * @param column the column * @param clazz The wrapped class * @param locale The locale to use * @param threshold Relative minimal number of values that must match to include a data type in the results * @return the matching data types */ public <U> List<Pair<DataType<?>, Double>> getMatchingDataTypes(int column, Class<U> clazz, Locale locale, double threshold) { checkRegistry(); checkColumn(column); double distinct = this.getDistinctValues(column).length; List<Pair<DataType<?>, Double>> result = new ArrayList<Pair<DataType<?>, Double>>(); DataTypeDescription<U> description = DataType.list(clazz); if (description.hasFormat()) { for (String format : description.getExampleFormats()) { DataType<U> type = description.newInstance(format, locale); double matching = getNumConformingValues(column, type) / distinct; if (matching >= threshold) { result.add(new Pair<DataType<?>, Double>(type, matching)); } } } else { DataType<U> type = description.newInstance(); double matching = getNumConformingValues(column, type) / distinct; if (matching >= threshold) { result.add(new Pair<DataType<?>, Double>(type, matching)); } } return result; } /** * Returns a mapping from data types to the relative number of values that conform to the according type. * This method uses the default locale. * * @param column the column * @param threshold Relative minimal number of values that must match to include a data type in the results * @return the matching data types */ public List<Pair<DataType<?>, Double>> getMatchingDataTypes(int column, double threshold) { return getMatchingDataTypes(column, Locale.getDefault(), threshold); } /** * Returns a mapping from data types to the relative number of values that conform to the according type * This method only returns types that match at least 80% of all values in the column . * * @param column the column * @param locale The locale to use * @return the matching data types */ public List<Pair<DataType<?>, Double>> getMatchingDataTypes(int column, Locale locale) { return getMatchingDataTypes(column, locale, 0.8d); } /** * Returns a mapping from data types to the relative number of values that conform to the according type. * * @param column the column * @param locale The locale to use * @param threshold Relative minimal number of values that must match to include a data type in the results * @return the matching data types */ public List<Pair<DataType<?>, Double>> getMatchingDataTypes(int column, Locale locale, double threshold) { checkRegistry(); checkColumn(column); List<Pair<DataType<?>, Double>> result = new ArrayList<Pair<DataType<?>, Double>>(); result.addAll(getMatchingDataTypes(column, Long.class, locale, threshold)); result.addAll(getMatchingDataTypes(column, Date.class, locale, threshold)); result.addAll(getMatchingDataTypes(column, Double.class, locale, threshold)); result.add(new Pair<DataType<?>, Double>(DataType.STRING, 1.0d)); // Sort order final Map<Class<?>, Integer> order = new HashMap<Class<?>, Integer>(); order.put(Long.class, 0); order.put(Date.class, 1); order.put(Double.class, 2); order.put(String.class, 3); // Sort Collections.sort(result, new Comparator<Pair<DataType<?>, Double>>() { public int compare(Pair<DataType<?>, Double> o1, Pair<DataType<?>, Double> o2) { // Sort by matching quality int cmp = o1.getSecond().compareTo(o2.getSecond()); if (cmp != 0) return -cmp; // Sort by order int order1 = order.get(o1.getFirst().getDescription().getWrappedClass()); int order2 = order.get(o2.getFirst().getDescription().getWrappedClass()); return Integer.compare(order1, order2); } }); return result; } /** * Returns a set of values that do not conform to the given data type. * * @param column The column to test * @param type The type to test * @param max The maximal number of values returned by this method * @return the non conforming values */ public String[] getNonConformingValues(int column, DataType<?> type, int max) { checkRegistry(); checkColumn(column); Set<String> result = new HashSet<String>(); for (String value : this.getDistinctValues(column)) { if (!type.isValid(value)) { result.add(value); } if (result.size() == max) { break; } } return result.toArray(new String[result.size()]); } /** * Returns the number of columns in the dataset. * * @return the num columns */ public abstract int getNumColumns(); /** * Returns the number of (distinct) values that conform to the given data type. * * @param column The column to test * @param type The type to test * @return the num conforming values */ public int getNumConformingValues(int column, DataType<?> type) { checkRegistry(); checkColumn(column); int count = 0; for (String value : this.getDistinctValues(column)) { count += type.isValid(value) ? 1 : 0; } return count; } /** * Returns the number of rows in the dataset. * * @return the num rows */ public abstract int getNumRows(); /** * Returns a risk estimator * @param model * @return */ public RiskEstimateBuilder getRiskEstimator(ARXPopulationModel model) { return getRiskEstimator(model, getDefinition().getQuasiIdentifyingAttributes()); } /** * Returns a risk estimator * @param model * @param config * @return */ public RiskEstimateBuilder getRiskEstimator(ARXPopulationModel model, ARXSolverConfiguration config) { return getRiskEstimator(model, getDefinition().getQuasiIdentifyingAttributes(), config); } /** * Returns a risk estimator for the given set of equivalence classes. Saves resources by re-using existing classes * @param model * @param classes * @return */ public RiskEstimateBuilder getRiskEstimator(ARXPopulationModel model, RiskModelHistogram classes) { return new RiskEstimateBuilder(model, new DataHandleInternal(this), classes, getConfiguration()); } /** * Returns a risk estimator for the given set of equivalence classes. Saves resources by re-using existing classes * @param model * @param classes * @param config * @return */ public RiskEstimateBuilder getRiskEstimator(ARXPopulationModel model, RiskModelHistogram classes, ARXSolverConfiguration config) { return new RiskEstimateBuilder(model, new DataHandleInternal(this), classes, config, getConfiguration()); } /** * Returns a risk estimator for the given set of quasi-identifiers * @param model * @param qis * @return */ public RiskEstimateBuilder getRiskEstimator(ARXPopulationModel model, Set<String> qis) { return new RiskEstimateBuilder(model, new DataHandleInternal(this), qis, getConfiguration()); } /** * Returns a risk estimator for the given set of quasi-identifiers * @param model * @param qis * @param config * @return */ public RiskEstimateBuilder getRiskEstimator(ARXPopulationModel model, Set<String> qis, ARXSolverConfiguration config) { return new RiskEstimateBuilder(model, new DataHandleInternal(this), qis, config, getConfiguration()); } /** * Returns an object providing access to basic descriptive statistics about the data represented * by this handle. * * @return the statistics */ public abstract StatisticsBuilder getStatistics(); /** * Returns the transformation . * * @return the transformation */ public ARXNode getTransformation() { return node; } /** * Returns the value in the specified cell. * * @param row The cell's row index * @param col The cell's column index * @return the value */ public abstract String getValue(int row, int col); /** * Returns a new data handle that represents a context specific view on the dataset. * * @return the view */ public DataHandle getView() { checkRegistry(); if (subset == null) { return this; } else { return subset; } } /** * Has this handle been optimized with local recoding? * @return */ public boolean isOptimized() { checkRegistry(); return false; } /** * Determines whether this handle is orphaned, i.e., should not be used anymore * * @return true, if is orphaned */ public boolean isOrphaned() { return registry == null; } /** * Determines whether a given row is an outlier in the currently associated * data transformation. * * @param row the row * @return true, if is outlier */ public boolean isOutlier(int row) { checkRegistry(); return registry.isOutlier(this, row); } /** * Returns an iterator over the data. * * @return the iterator */ public abstract Iterator<String[]> iterator(); /** * Releases this handle and all associated resources. If a input handle is released all associated results are released * as well. */ public void release() { if (registry != null) { registry.release(this); } } /** * Renders this object * @return */ public ElementData render() { ElementData data = new ElementData("Data"); data.addProperty("Records", this.getNumRows()); data.addProperty("Attributes", this.getNumColumns()); return data; } /** * Replaces the original value with the replacement in the given column. Only supported by * handles for input data. * * @param column the column * @param original the original * @param replacement the replacement * @return Whether the original value was found */ public boolean replace(int column, String original, String replacement) { checkRegistry(); checkColumn(column); if (!getDataType(getAttributeName(column)).isValid(replacement)) { throw new IllegalArgumentException("Value does'nt match the attribute's data type"); } for (String s : getDistinctValues(column)) { if (s.equals(replacement)) { throw new IllegalArgumentException("Value is already contained in the data set"); } } return registry.replace(column, original, replacement); } /** * Writes the data to a CSV file. * * @param file the file * @throws IOException Signals that an I/O exception has occurred. */ public void save(final File file) throws IOException { checkRegistry(); final CSVDataOutput output = new CSVDataOutput(file); output.write(iterator()); } /** * Writes the data to a CSV file. * * @param file A file * @param separator The utilized separator character * @throws IOException Signals that an I/O exception has occurred. */ public void save(final File file, final char separator) throws IOException { checkRegistry(); final CSVDataOutput output = new CSVDataOutput(file, separator); output.write(iterator()); } /** * Writes the data to a CSV file. * * @param file the file * @param config the config * @throws IOException Signals that an I/O exception has occurred. */ public void save(final File file, final CSVSyntax config) throws IOException { checkRegistry(); final CSVDataOutput output = new CSVDataOutput(file, config); output.write(iterator()); } /** * Writes the data to a CSV file. * * @param out the out * @throws IOException Signals that an I/O exception has occurred. */ public void save(final OutputStream out) throws IOException { checkRegistry(); final CSVDataOutput output = new CSVDataOutput(out); output.write(iterator()); } /** * Writes the data to a CSV file. * * @param out Output stream * @param separator The utilized separator character * @throws IOException Signals that an I/O exception has occurred. */ public void save(final OutputStream out, final char separator) throws IOException { checkRegistry(); final CSVDataOutput output = new CSVDataOutput(out, separator); output.write(iterator()); } /** * Writes the data to a CSV file. * * @param out the out * @param config the config * @throws IOException Signals that an I/O exception has occurred. */ public void save(final OutputStream out, final CSVSyntax config) throws IOException { checkRegistry(); final CSVDataOutput output = new CSVDataOutput(out, config); output.write(iterator()); } /** * Writes the data to a CSV file. * * @param path the path * @throws IOException Signals that an I/O exception has occurred. */ public void save(final String path) throws IOException { checkRegistry(); final CSVDataOutput output = new CSVDataOutput(path); output.write(iterator()); } /** * Writes the data to a CSV file. * * @param path A path * @param separator The utilized separator character * @throws IOException Signals that an I/O exception has occurred. */ public void save(final String path, final char separator) throws IOException { checkRegistry(); final CSVDataOutput output = new CSVDataOutput(path, separator); output.write(iterator()); } /** * Writes the data to a CSV file. * * @param path the path * @param config the config * @throws IOException Signals that an I/O exception has occurred. */ public void save(final String path, final CSVSyntax config) throws IOException { checkRegistry(); final CSVDataOutput output = new CSVDataOutput(path, config); output.write(iterator()); } /** * Sorts the dataset according to the given columns. Will sort input and * output analogously. * * @param ascending Sort ascending or descending * @param columns An integer array containing column indicides */ public void sort(boolean ascending, int... columns) { checkRegistry(); registry.sort(this, ascending, columns); } /** * Sorts the dataset according to the given columns and the given range. * Will sort input and output analogously. * * @param from The lower bound * @param to The upper bound * @param ascending Sort ascending or descending * @param columns An integer array containing column indicides */ public void sort(int from, int to, boolean ascending, int... columns) { checkRegistry(); registry.sort(this, from, to, ascending, columns); } /** * Sorts the dataset according to the given columns. Will sort input and * output analogously. * * @param swapper A swapper * @param ascending Sort ascending or descending * @param columns An integer array containing column indicides */ public void sort(Swapper swapper, boolean ascending, int... columns) { checkRegistry(); registry.sort(this, swapper, ascending, columns); } /** * Sorts the dataset according to the given columns and the given range. * Will sort input and output analogously. * * @param swapper A swapper * @param from The lower bound * @param to The upper bound * @param ascending Sort ascending or descending * @param columns An integer array containing column indicides */ public void sort(Swapper swapper, int from, int to, boolean ascending, int... columns) { checkRegistry(); registry.sort(this, swapper, from, to, ascending, columns); } /** * Swaps both rows. * * @param row1 the row1 * @param row2 the row2 */ public void swap(int row1, int row2) { checkRegistry(); registry.swap(this, row1, row2); } /** * Checks a column index. * * @param column1 the column1 */ protected void checkColumn(final int column1) { if ((column1 < 0) || (column1 > (header.length - 1))) { throw new IndexOutOfBoundsException("Column index out of range: " + column1 + ". Valid: 0 - " + (header.length - 1)); } } /** * Checks the column indexes. * * @param columns the columns */ protected void checkColumns(final int[] columns) { // Check if ((columns.length == 0) || (columns.length > header.length)) { throw new IllegalArgumentException("Invalid number of column indices"); } // Create a sorted copy of the input columns final int[] cols = new int[columns.length]; System.arraycopy(columns, 0, cols, 0, cols.length); Arrays.sort(cols); // Check for (int i = 0; i < cols.length; i++) { checkColumn(cols[i]); if ((i > 0) && (cols[i] == cols[i - 1])) { throw new IllegalArgumentException("Duplicate column index"); } } } /** * Checks whether a registry is referenced. */ protected void checkRegistry() { if (registry == null) { throw new RuntimeException("This data handle (" + this.getClass().getSimpleName() + "@" + hashCode() + ") is orphaned"); } } /** * Checks a row index. * * @param row1 the row1 * @param length the length */ protected void checkRow(final int row1, final int length) { if ((row1 < 0) || (row1 > length)) { throw new IndexOutOfBoundsException("Row index (" + row1 + ") out of range (0 <= row <= " + length + ")"); } } /** * Releases all resources. */ protected abstract void doRelease(); /** * Returns the base data type without generalization. * * @param attribute the attribute * @return the base data type */ protected DataType<?> getBaseDataType(final String attribute) { checkRegistry(); return getRegistry().getBaseDataType(attribute); } /** * Returns the ARXConfiguration that is currently being used, null if this is an input handle * @return */ protected abstract ARXConfiguration getConfiguration(); /** * Generates an array of data types. * * @return the data type array */ protected abstract DataType<?>[][] getDataTypeArray(); /** * Returns the distinct values. * * @param column the column * @param ignoreSuppression * @param handler the handler * @return the distinct values */ protected abstract String[] getDistinctValues(int column, boolean ignoreSuppression, InterruptHandler handler); /** * Returns the registry associated with this handle. * * @return the registry */ protected DataRegistry getRegistry() { return registry; } /** * A negative integer, zero, or a positive integer as the first argument is * less than, equal to, or greater than the second. It uses the specified * data types for comparison. If no datatype is specified for a specific * column it uses string comparison. * * @param row1 the row1 * @param row2 the row2 * @param columns the columns * @param ascending the ascending * @return the int */ protected int internalCompare(final int row1, final int row2, final int[] columns, final boolean ascending) { checkRegistry(); try { for (int i = 0; i < columns.length; i++) { int index = columns[i]; int cmp = dataTypes[0][index].compare(internalGetValue(row1, index, false), internalGetValue(row2, index, false)); if (cmp != 0) { return ascending ? cmp : -cmp; } } return 0; } catch (final Exception e) { throw new RuntimeException(e); } } /** * Internal representation of get value. * * @param row the row * @param col the col * @return the string */ protected abstract String internalGetValue(int row, int col, boolean ignoreSuppression); /** * Internal replacement method. * * @param column the column * @param original the original * @param replacement the replacement * @return true, if successful */ protected abstract boolean internalReplace(int column, String original, String replacement); /** * Returns whether the data represented by this handle is anonymous * @return */ protected boolean isAnonymous() { return false; } /** * Updates the registry. * * @param registry the new registry */ protected void setRegistry(DataRegistry registry) { this.registry = registry; } /** * Sets the subset. * * @param handle the new view */ protected void setView(DataHandle handle) { subset = handle; } }