/* * ARX: Powerful Data Anonymization * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.deidentifier.arx.aggregates; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.Serializable; import java.util.HashSet; import java.util.Set; import org.deidentifier.arx.AttributeType.Hierarchy; import com.carrotsearch.hppc.CharOpenHashSet; /** * This class enables building hierarchies for categorical and non-categorical values * using redaction. Data items are 1) aligned left-to-right or right-to-left, 2) differences in * length are filled with a padding character, 3) then, equally long values are redacted character by character * from left-to-right or right-to-left. * * @author Fabian Prasser * @param <T> */ public class HierarchyBuilderRedactionBased<T> extends HierarchyBuilder<T> implements Serializable { /** * Order */ public static enum Order { /** TODO */ LEFT_TO_RIGHT, /** TODO */ RIGHT_TO_LEFT } /** TODO */ private static final long serialVersionUID = 3625654600380531803L; /** * Values are aligned left-to-right and redacted right-to-left. Redacted characters * are replaced with the given character. The same character is used for padding. * * @param <T> * @param redactionCharacter * @return */ public static <T> HierarchyBuilderRedactionBased<T> create(char redactionCharacter){ return new HierarchyBuilderRedactionBased<T>(redactionCharacter); } /** * Loads a builder specification from the given file. * * @param <T> * @param file * @return * @throws IOException */ @SuppressWarnings("unchecked") public static <T> HierarchyBuilderRedactionBased<T> create(File file) throws IOException{ ObjectInputStream ois = null; try { ois = new ObjectInputStream(new FileInputStream(file)); HierarchyBuilderRedactionBased<T> result = (HierarchyBuilderRedactionBased<T>)ois.readObject(); return result; } catch (Exception e) { throw new IOException(e); } finally { if (ois != null) ois.close(); } } /** * Values are aligned according to the alignmentOrder and redacted according to the redactionOrder. * Redacted characters are replaced with the given character. The same character is used for padding. * * @param <T> * @param alignmentOrder * @param redactionOrder * @param redactionCharacter * @return */ public static <T> HierarchyBuilderRedactionBased<T> create(Order alignmentOrder, Order redactionOrder, char redactionCharacter){ return new HierarchyBuilderRedactionBased<T>(alignmentOrder, redactionOrder, redactionCharacter); } /** * Values are aligned according to the alignmentOrder and redacted according to the redactionOrder. * Redacted characters are replaced with the given character. The padding character is used for padding. * * @param <T> * @param alignmentOrder * @param redactionOrder * @param paddingCharacter * @param redactionCharacter * @return */ public static <T> HierarchyBuilderRedactionBased<T> create(Order alignmentOrder, Order redactionOrder, char paddingCharacter, char redactionCharacter){ return new HierarchyBuilderRedactionBased<T>(alignmentOrder, redactionOrder, paddingCharacter, redactionCharacter); } /** * Loads a builder specification from the given file. * * @param <T> * @param file * @return * @throws IOException */ public static <T> HierarchyBuilderRedactionBased<T> create(String file) throws IOException{ return create(new File(file)); } /** Alignment order. */ private Order aligmentOrder = Order.LEFT_TO_RIGHT; /** Padding character. */ private char paddingCharacter = '*'; /** Redaction character. */ private char redactionCharacter = '*'; /** Redaction order. */ private Order redactionOrder = Order.RIGHT_TO_LEFT; /** Result. */ private transient String[][] result; /** * Meta-data about the nature of the domain of the attribute. Modeled as Double * for backwards compatibility */ private Double maxValueLength; /** * Meta-data about the nature of the domain of the attribute. Modeled as Double * for backwards compatibility */ private Double domainSize; /** * Meta-data about the nature of the domain of the attribute. Modeled as Double * for backwards compatibility */ private Double alphabetSize; /** * Values are aligned left-to-right and redacted right-to-left. Redacted characters * are replaced with the given character. The same character is used for padding. * @param redactionCharacter */ private HierarchyBuilderRedactionBased(char redactionCharacter){ super(Type.REDACTION_BASED); this.redactionCharacter = redactionCharacter; this.paddingCharacter = redactionCharacter; } /** * Values are aligned according to the alignmentOrder and redacted according to the redactionOrder. * Redacted characters are replaced with the given character. The same character is used for padding. * @param alignmentOrder * @param redactionOrder * @param redactionCharacter */ private HierarchyBuilderRedactionBased(Order alignmentOrder, Order redactionOrder, char redactionCharacter){ super(Type.REDACTION_BASED); this.redactionCharacter = redactionCharacter; this.paddingCharacter = redactionCharacter; this.aligmentOrder = alignmentOrder; this.redactionOrder = redactionOrder; } /** * Values are aligned according to the alignmentOrder and redacted according to the redactionOrder. * Redacted characters are replaced with the given character. The padding character is used for padding. * @param alignmentOrder * @param redactionOrder * @param paddingCharacter * @param redactionCharacter */ private HierarchyBuilderRedactionBased(Order alignmentOrder, Order redactionOrder, char paddingCharacter, char redactionCharacter){ super(Type.REDACTION_BASED); this.redactionCharacter = redactionCharacter; this.paddingCharacter = paddingCharacter; this.aligmentOrder = alignmentOrder; this.redactionOrder = redactionOrder; } /** * Creates a new hierarchy, based on the predefined specification. * * @return */ public Hierarchy build(){ // Check if (result == null) { throw new IllegalArgumentException("Please call prepare() first"); } // Return Hierarchy h = Hierarchy.create(result); this.result = null; return h; } /** * Creates a new hierarchy, based on the predefined specification. * * @param data * @return */ public Hierarchy build(String[] data){ prepare(data); return build(); } /** * Returns the alignment order. * * @return */ public Order getAligmentOrder() { return aligmentOrder; } /** * <p>Returns properties about the attribute's domain. Currently, this information is only used for * evaluating information loss with the generalized loss metric for attributes with functional * redaction-based hierarchies. May return <code>null</code>.</p> * @return Size of the alphabet: the possible number of elements per character of any value from the domain */ public Double getAlphabetSize() { return alphabetSize; } /** * <p>Returns properties about the attribute's domain. Currently, this information is only used for * evaluating information loss with the generalized loss metric for attributes with functional * redaction-based hierarchies. May return <code>null</code>.</p> * @return Size of the domain: the number of elements in the domain of the attribute */ public Double getDomainSize() { return domainSize; } /** * <p>Returns properties about the attribute's domain. Currently, this information is only used for * evaluating information loss with the generalized loss metric for attributes with functional * redaction-based hierarchies. May return <code>null</code>.</p> * * @return Max. length of an element: the number of characters of the largest element in the domain */ public Double getMaxValueLength() { return maxValueLength; } /** * Returns the padding character. * * @return */ public char getPaddingCharacter() { return paddingCharacter; } /** * Returns the redaction character. * * @return */ public char getRedactionCharacter() { return redactionCharacter; } /** * Returns the redaction order. * * @return */ public Order getRedactionOrder() { return redactionOrder; } /** * Returns whether domain-properties are available for this builder. Currently, this information is only used for * evaluating information loss with the generalized loss metric for attributes with functional * redaction-based hierarchies. * @return */ public boolean isDomainPropertiesAvailable() { return maxValueLength != null && domainSize != null && alphabetSize != null; } /** * Prepares the builder. Returns a list of the number of equivalence classes per level * * @param data * @return */ public int[] prepare(String[] data){ // Check if (this.result == null) { prepareResult(data); } // Compute int[] sizes = new int[this.result[0].length]; for (int i=0; i < sizes.length; i++){ Set<String> set = new HashSet<String>(); for (int j=0; j<this.result.length; j++) { set.add(result[j][i]); } sizes[i] = set.size(); } // Return return sizes; } /** * <p>Sets properties about the attribute's domain. Currently, this information is only used for * evaluating information loss with the generalized loss metric for attributes with functional * redaction-based hierarchies. Required properties are:</p> * <ul> * <li>Size of the domain: the number of elements in the domain of the attribute</li> * <li>Size of the alphabet: the possible number of elements per character of any value from the domain</li> * <li>Max. length of an element: the number of characters of the largest element in the domain</li> * </ul> * <p>As a simplifying assumption, it is assumed that the domain values are distributed equally regarding * their length and their characters from the alphabet.</p> * <p>This method will estimate the size of the domain as * domainSize = alphabetSize^{maxValueLength}</p> * * * @param alphabetSize * @param maxValueLength */ public void setAlphabetSize(int alphabetSize, int maxValueLength){ this.domainSize = Math.pow((double)alphabetSize, (double)maxValueLength); this.maxValueLength = Double.valueOf(maxValueLength); this.alphabetSize = Double.valueOf(alphabetSize); } /** * <p>Sets properties about the attribute's domain. Currently, this information is only used for * evaluating information loss with the generalized loss metric for attributes with functional * redaction-based hierarchies. Required properties are:</p> * <ul> * <li>Size of the domain: the number of elements in the domain of the attribute</li> * <li>Size of the alphabet: the possible number of elements per character of any value from the domain</li> * <li>Max. length of an element: the number of characters of the largest element in the domain</li> * </ul> * * @param domainSize * @param alphabetSize * @param maxValueLength */ public void setDomainAndAlphabetSize(int domainSize, int alphabetSize, int maxValueLength){ this.domainSize = Double.valueOf(domainSize); this.maxValueLength = Double.valueOf(maxValueLength); this.alphabetSize = Double.valueOf(alphabetSize); } /** * <p>Sets properties about the attribute's domain. Currently, this information is only used for * evaluating information loss with the generalized loss metric for attributes with functional * redaction-based hierarchies.</p> * * @param data */ public void setDomainMetadata(String[] data) { CharOpenHashSet characterSet = new CharOpenHashSet(); this.maxValueLength = 0d; for (int i = 0; i < data.length; i++) { String value = data[i]; this.maxValueLength = Math.max(this.maxValueLength, value.length()); char[] charArray = value.toCharArray(); for (int j = 0; j < charArray.length; j++) { characterSet.add(charArray[j]); } } this.domainSize = (double)data.length; this.alphabetSize = (double)characterSet.size(); } /** * <p>Sets properties about the attribute's domain. Currently, this information is only used for * evaluating information loss with the generalized loss metric for attributes with functional * redaction-based hierarchies. Required properties are:</p> * <ul> * <li>Size of the domain: the number of elements in the domain of the attribute</li> * <li>Size of the alphabet: the possible number of elements per character of any value from the domain</li> * <li>Max. length of an element: the number of characters of the largest element in the domain</li> * </ul> * <p>As a simplifying assumption, it is assumed that the domain values are distributed equally regarding * their length and their characters from the alphabet.</p> * <p>This method will estimate the size of the alphabet as * alphabetSize = pow(domainSize, 1.0d / maxValueLength)</p> * * @param domainSize * @param maxValueLength */ public void setDomainSize(int domainSize, int maxValueLength){ this.domainSize = Double.valueOf(domainSize); this.maxValueLength = Double.valueOf(maxValueLength); this.alphabetSize = Math.pow(domainSize, 1.0d / (double)maxValueLength); } /** * Computes the hierarchy. * * @param data */ private void prepareResult(String[] data){ // Determine length int length = Integer.MIN_VALUE; for (String s : data) { length = Math.max(length, s.length()); } // Build padding string StringBuilder paddingBuilder = new StringBuilder(); for (int i=0; i<length; i++) paddingBuilder.append(paddingCharacter); String padding = paddingBuilder.toString(); // Build list of base strings String[] base = new String[data.length]; for (int i=0; i<data.length; i++) { if (data[i].length()<length) { String pad = padding.substring(0, length - data[i].length()); if (aligmentOrder == Order.RIGHT_TO_LEFT) { base[i] = pad + data[i]; } else { base[i] = data[i] + pad; } } else { base[i] = data[i]; } } // Build padding string StringBuilder redactionBuilder = new StringBuilder(); for (int i=0; i<length; i++) redactionBuilder.append(redactionCharacter); String redaction = redactionBuilder.toString(); // Build result this.result = new String[base.length][length + 1]; for (int i=0; i<base.length; i++){ result[i] = new String[length + 1]; result[i][0] = data[i]; for (int j=1; j<length + 1; j++){ String redact = redaction.substring(0, j); if (redactionOrder == Order.RIGHT_TO_LEFT) { result[i][j] = base[i].substring(0, length - j) + redact; } else { result[i][j] = redact + base[i].substring(0, length - j); } } } } }