/*
* ARX: Powerful Data Anonymization
* Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.carrotsearch.hppc.IntArrayList;
/**
* This class represents the dataset that is to be de-identified
* as a subset of the given population table.
*
* @author Fabian Prasser
* @author Florian Kohlmayer
*/
public class DataSubset implements Serializable {

    /**
     * Wrapper around a string array that gives a record value-based
     * {@code equals}/{@code hashCode} semantics, so that records can be
     * used as keys of a hash map.
     *
     * @author Fabian Prasser
     * @author Florian Kohlmayer
     */
    private static class Entry implements Serializable {

        /** SVUID */
        private static final long serialVersionUID = 31695068160887476L;

        /** Record */
        private final String[] data;

        /** Hashcode, computed once from the record's values at construction time */
        private final int hashcode;

        /**
         * Creates a new entry wrapping the given record. The array is not copied
         * and must not be modified afterwards, because the hash code is cached.
         *
         * @param data the values of the record
         */
        public Entry(String[] data) {
            this.data = data;
            this.hashcode = Arrays.hashCode(data);
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            // Also covers null; avoids a ClassCastException for foreign types
            if (!(obj instanceof Entry)) {
                return false;
            }
            return Arrays.equals(data, ((Entry) obj).data);
        }

        @Override
        public int hashCode() {
            return hashcode;
        }
    }

    /** SVUID */
    private static final long serialVersionUID = 3945730896172205344L;

    /**
     * Create a subset by matching two data instances. Each row of
     * <code>subset</code> is matched to a distinct row of <code>data</code>
     * holding exactly the same values. If several rows of <code>data</code>
     * qualify, the one with the smallest index that has not been matched yet
     * is used.
     *
     * @param data the dataset from which the subset is drawn
     * @param subset the rows that make up the subset
     * @return the subset, expressed as row indices of <code>data</code>
     * @throws IllegalArgumentException if a row of <code>subset</code> does not occur in
     *         <code>data</code>, or occurs more often in <code>subset</code> than in <code>data</code>
     */
    public static DataSubset create(Data data, Data subset) {

        // TODO: Implement more efficiently
        DataHandle bHandle = data.getHandle();
        DataHandle sHandle = subset.getHandle();
        int bRows = bHandle.getNumRows();
        int bColumns = bHandle.getNumColumns();

        // Add background data to map: record -> indices of the rows holding it.
        // Rows are visited in descending order, so that the smallest unmatched
        // index of each record is always the last element of its list and can
        // be taken in constant time during matching.
        Map<Entry, List<Integer>> background = new HashMap<Entry, List<Integer>>();
        for (int i = bRows - 1; i >= 0; i--) {
            Entry entry = new Entry(readRow(bHandle, i, bColumns));
            List<Integer> indices = background.get(entry);
            if (indices == null) {
                indices = new ArrayList<Integer>();
                background.put(entry, indices);
            }
            indices.add(i);
        }

        // Init
        RowSet bitset = RowSet.create(data);
        int sRows = sHandle.getNumRows();
        int sColumns = sHandle.getNumColumns();
        int[] array = new int[sRows];

        // Match subset
        for (int i = 0; i < sRows; i++) {
            String[] tuple = readRow(sHandle, i, sColumns);
            List<Integer> indices = background.get(new Entry(tuple));
            if (indices == null) {
                throw new IllegalArgumentException("No match found for: "+Arrays.toString(tuple));
            }
            // All rows of the background data holding this record have already been consumed
            if (indices.isEmpty()) {
                throw new IllegalArgumentException("Too many matches found for: "+Arrays.toString(tuple));
            }
            // Consume the smallest remaining index (stored at the tail)
            int index = indices.remove(indices.size() - 1);
            bitset.add(index);
            array[i] = index;
        }

        // Return
        Arrays.sort(array);
        return new DataSubset(bitset, array);
    }

    /**
     * Creates a subset from the given selector. A row is part of the subset
     * if the selector reports it as selected.
     *
     * @param data the dataset from which the subset is drawn
     * @param selector decides for each row index whether it is part of the subset
     * @return the subset
     */
    public static DataSubset create(Data data, DataSelector selector) {

        // Init
        int rows = data.getHandle().getNumRows();
        RowSet bitset = RowSet.create(data);

        // Upper bound for the number of selected rows; trimmed below
        int[] selected = new int[rows];
        int count = 0;

        // Check: rows are visited in ascending order, so the result is sorted
        for (int i = 0; i < rows; i++) {
            if (selector.isSelected(i)) {
                bitset.add(i);
                selected[count++] = i;
            }
        }

        // Return
        return new DataSubset(bitset, Arrays.copyOf(selected, count));
    }

    /**
     * Creates a new subset from the given row set, from which a copy is created.
     *
     * @param data the dataset from which the subset is drawn
     * @param subset the rows that make up the subset
     * @return the subset
     */
    public static DataSubset create(Data data, RowSet subset) {
        return create(data.getHandle().getNumRows(), subset);
    }

    /**
     * Creates a new subset from the given set of tuple indices.
     *
     * @param data the dataset from which the subset is drawn
     * @param subset the indices of the rows that make up the subset
     * @return the subset
     * @throws IllegalArgumentException if an index is negative or not smaller than the number of rows
     */
    public static DataSubset create(Data data, Set<Integer> subset) {
        return create(data.getHandle().getNumRows(), subset);
    }

    /**
     * Creates a new subset from the given row set, from which a copy is created.
     * Only rows with an index smaller than <code>rows</code> are considered.
     *
     * @param rows the number of rows of the dataset
     * @param subset the rows that make up the subset
     * @return the subset
     */
    public static DataSubset create(int rows, RowSet subset) {
        RowSet bitset = RowSet.create(rows);
        int[] array = new int[subset.size()];
        int idx = 0;
        for (int i = 0; i < rows; i++) {
            if (subset.contains(i)) {
                bitset.add(i);
                array[idx++] = i;
            }
        }
        // NOTE(review): assumes RowSet.size() is the number of contained rows - confirm.
        // If the given set also contains rows >= rows, fewer entries than allocated
        // were written; trim so that no unused trailing zeros remain in the array.
        if (idx < array.length) {
            array = Arrays.copyOf(array, idx);
        }
        return new DataSubset(bitset, array);
    }

    /**
     * Creates a new subset from the given set of tuple indices.
     *
     * @param rows the number of rows of the dataset
     * @param subset the indices of the rows that make up the subset
     * @return the subset
     * @throws IllegalArgumentException if an index is negative or not smaller than <code>rows</code>
     */
    public static DataSubset create(int rows, Set<Integer> subset) {
        RowSet bitset = RowSet.create(rows);
        int[] array = new int[subset.size()];
        int idx = 0;
        for (Integer line : subset) {
            if (line < 0 || line >= rows) {
                throw new IllegalArgumentException("Subset index out of range!");
            }
            bitset.add(line);
            array[idx++] = line;
        }
        // The iteration order of a set is unspecified
        Arrays.sort(array);
        return new DataSubset(bitset, array);
    }

    /**
     * Reads all values of one row from the given handle.
     *
     * @param handle the handle to read from
     * @param row the index of the row
     * @param columns the number of columns to read
     * @return the values of the row, in column order
     */
    private static String[] readRow(DataHandle handle, int row, int columns) {
        String[] tuple = new String[columns];
        for (int j = 0; j < columns; j++) {
            tuple[j] = handle.getValue(row, j);
        }
        return tuple;
    }

    /** The subset as a bitset. */
    protected RowSet set;

    /** The subset as a sorted array of indices. */
    protected int[] array;

    /**
     * Creates a new instance. Both arguments are stored as given, without copying.
     *
     * @param bitSet the subset as a bitset
     * @param sortedIndices the subset as a sorted array of indices
     */
    private DataSubset(RowSet bitSet, int[] sortedIndices) {
        this.set = bitSet;
        this.array = sortedIndices;
    }

    /**
     * Clone. The returned instance holds copies of both the bitset and the array.
     *
     * @return an independent copy of this subset
     */
    @Override
    public DataSubset clone() {
        return new DataSubset(this.set.clone(), Arrays.copyOf(this.array, this.array.length));
    }

    /**
     * Getter
     *
     * @return the subset as a sorted array of indices. This is the internal array, not a copy.
     */
    public int[] getArray() {
        return array;
    }

    /**
     * Getter
     *
     * @return the subset as a bitset. This is the internal instance, not a copy.
     */
    public RowSet getSet() {
        return set;
    }

    /**
     * Returns the size of the data subset
     *
     * @return the number of indices in the subset
     */
    public int getSize() {
        return array.length;
    }

    /**
     * Returns a new data subset that refers to a dataset consisting only of the rows
     * contained in the given row set. The rows of the row set are renumbered
     * consecutively, starting at zero, in ascending order of their original index.
     * The result contains the new index of every such row that is also part of this subset.
     *
     * @param rowset the rows that are retained
     * @return the subset, expressed in terms of the renumbered rows
     */
    protected DataSubset getSubsetInstance(RowSet rowset) {
        // Position of the current row among the retained rows
        int index = -1;
        RowSet newset = RowSet.create(rowset.size());
        IntArrayList list = new IntArrayList();
        for (int row = 0; row < this.set.length(); row++) {
            if (rowset.contains(row)) {
                index++;
                if (this.set.contains(row)) {
                    newset.add(index);
                    list.add(index);
                }
            }
        }
        return new DataSubset(newset, list.toArray());
    }
}